Viewing file: client.py (24.06 KB) -rw-r--r-- Select action/file-type: (+) | (+) | (+) | Code (+) | Session (+) | (+) | SDB (+) | (+) | (+) | (+) | (+) | (+) |
# -*- test-case-name: twisted.web.test.test_webclient -*- # Copyright (c) 2001-2010 Twisted Matrix Laboratories. # See LICENSE for details.
""" HTTP client. """
import os, types from urlparse import urlunparse
from twisted.python import log from twisted.web import http from twisted.internet import defer, protocol, reactor from twisted.python import failure from twisted.python.util import InsensitiveDict from twisted.web import error from twisted.web.http_headers import Headers from twisted.python.compat import set
class PartialDownloadError(error.Error):
    """
    The connection went away before the whole page had been received.

    @ivar response: Whatever part of the response body had been downloaded
        by the time the connection was lost.
    """
class HTTPPageGetter(http.HTTPClient):
    """
    Gets a resource via HTTP, then quits.

    Typically used with L{HTTPClientFactory}.  Note that this class does not,
    by itself, do anything with the response.  If you want to download a
    resource into a file, use L{HTTPPageDownloader} instead.

    @ivar quietLoss: When true, C{connectionLost} and C{handleResponse}
        report nothing to the factory.  Set whenever this protocol drops the
        connection on purpose (redirect handling, timeout).

    @ivar followRedirect: Whether a redirect response causes a new
        connection to the C{Location} target.

    @ivar failed: Set to a true value by C{handleStatusDefault}; makes
        C{handleResponse} report an L{error.Error} to the factory.
    """

    quietLoss = 0
    followRedirect = True
    failed = 0

    # Request headers written explicitly by connectionMade; entries with
    # these names in the factory's header mapping are not copied verbatim.
    _specialHeaders = set(('host', 'user-agent', 'cookie', 'content-length'))

    def connectionMade(self):
        """
        Write the request line, the request headers and, when the factory
        has C{postdata}, the request body.
        """
        method = getattr(self.factory, 'method', 'GET')
        self.sendCommand(method, self.factory.path)
        self.sendHeader(
            'Host', self.factory.headers.get("host", self.factory.host))
        self.sendHeader('User-Agent', self.factory.agent)
        data = getattr(self.factory, 'postdata', None)
        if data is not None:
            self.sendHeader("Content-Length", str(len(data)))

        cookieData = []
        for (key, value) in self.factory.headers.items():
            lowered = key.lower()
            if lowered not in self._specialHeaders:
                # The special headers are computed by this method itself.
                self.sendHeader(key, value)
            if lowered == 'cookie':
                cookieData.append(value)
        for cookie, cookval in self.factory.cookies.items():
            cookieData.append('%s=%s' % (cookie, cookval))
        if cookieData:
            # A caller-supplied Cookie header and the factory's cookie
            # mapping are merged into one header.
            self.sendHeader('Cookie', '; '.join(cookieData))
        self.endHeaders()
        # From here on C{headers} collects the *response* headers.
        self.headers = {}

        if data is not None:
            self.transport.write(data)

    def handleHeader(self, key, value):
        """
        Called every time a header is received.  Stores the header
        information as key-value pairs in the C{headers} attribute.

        @type key: C{str}
        @param key: An HTTP header field name.

        @type value: C{str}
        @param value: An HTTP header field value.
        """
        key = key.lower()
        # Repeated headers accumulate in a list under the lowercased name.
        self.headers.setdefault(key, []).append(value)

    def handleStatus(self, version, status, message):
        """
        Remember the status line and forward it to the factory.
        """
        self.version, self.status, self.message = version, status, message
        self.factory.gotStatus(version, status, message)

    def handleEndHeaders(self):
        """
        Hand the collected response headers to the factory, then dispatch to
        C{handleStatus_<status>} or, when no such method exists, to
        C{handleStatusDefault}.
        """
        self.factory.gotHeaders(self.headers)
        m = getattr(self, 'handleStatus_' + self.status,
                    self.handleStatusDefault)
        m()

    def handleStatus_200(self):
        """
        Nothing to do for a 200 response; the body is delivered through
        C{handleResponse}.
        """

    def handleStatus_201(self):
        """
        Treat 201 exactly like 200.
        """
        self.handleStatus_200()

    def handleStatus_202(self):
        """
        Treat 202 exactly like 200.
        """
        self.handleStatus_200()

    def handleStatusDefault(self):
        """
        Mark the request as failed.
        """
        self.failed = 1

    def handleStatus_301(self):
        """
        Handle a redirect: either reconnect the factory to the C{Location}
        target or, when redirects are not followed, fail the factory with
        L{error.PageRedirect}.  In both cases the current connection is
        closed quietly.
        """
        l = self.headers.get('location')
        if not l:
            self.handleStatusDefault()
            return
        url = l[0]
        if self.followRedirect:
            # NOTE(review): the result is unused -- setURL() below parses
            # the URL again.  The call is kept so that behavior for URLs
            # that fail to parse stays exactly as before.
            _parse(url, defaultPort=self.transport.getPeer().port)

            self.factory._redirectCount += 1
            if self.factory._redirectCount >= self.factory.redirectLimit:
                err = error.InfiniteRedirection(
                    self.status,
                    'Infinite redirection detected',
                    location=url)
                self.factory.noPage(failure.Failure(err))
                self.quietLoss = True
                self.transport.loseConnection()
                return

            self.factory.setURL(url)

            if self.factory.scheme == 'https':
                from twisted.internet import ssl
                contextFactory = ssl.ClientContextFactory()
                reactor.connectSSL(self.factory.host, self.factory.port,
                                   self.factory, contextFactory)
            else:
                reactor.connectTCP(self.factory.host, self.factory.port,
                                   self.factory)
        else:
            self.handleStatusDefault()
            self.factory.noPage(
                failure.Failure(
                    error.PageRedirect(
                        self.status, self.message, location=url)))
        self.quietLoss = True
        self.transport.loseConnection()

    def handleStatus_302(self):
        """
        Handle a 302 response.  With C{afterFoundGet} set it is treated like
        a 303 (the retry uses GET); otherwise like a 301.

        Exactly one of the two handlers runs.  C{handleStatus_303} already
        delegates to C{handleStatus_301}, so calling both would follow the
        same redirect twice: two increments of the redirect counter and two
        new connections.
        """
        if self.afterFoundGet:
            self.handleStatus_303()
        else:
            self.handleStatus_301()

    def handleStatus_303(self):
        """
        Handle a 303 response: switch the factory to GET, then redirect.
        """
        self.factory.method = 'GET'
        self.handleStatus_301()

    def connectionLost(self, reason):
        """
        Unless the loss was intentional (C{quietLoss}), let the base class
        finish the response and report C{reason} to the factory.
        """
        if not self.quietLoss:
            http.HTTPClient.connectionLost(self, reason)
            self.factory.noPage(reason)

    def handleResponse(self, response):
        """
        Deliver the finished response to the factory and close the
        connection.

        @param response: The response body passed in by the base class.
        """
        if self.quietLoss:
            return
        if self.failed:
            self.factory.noPage(
                failure.Failure(
                    error.Error(
                        self.status, self.message, response)))
        if self.factory.method == 'HEAD':
            # Callback with empty string, since there is never a response
            # body for HEAD requests.
            self.factory.page('')
        elif self.length is not None and self.length != 0:
            # NOTE(review): C{length} is maintained by the base class;
            # presumably the number of body bytes still outstanding.
            self.factory.noPage(failure.Failure(
                PartialDownloadError(self.status, self.message, response)))
        else:
            self.factory.page(response)
        # server might be stupid and not close connection. admittedly
        # the fact we do only one request per connection is also
        # stupid...
        self.transport.loseConnection()

    def timeout(self):
        """
        Give up on the request: close the connection quietly and fail the
        factory with L{defer.TimeoutError}.
        """
        self.quietLoss = True
        self.transport.loseConnection()
        self.factory.noPage(
            defer.TimeoutError(
                "Getting %s took longer than %s seconds."
                % (self.factory.url, self.factory.timeout)))
class HTTPPageDownloader(HTTPPageGetter):
    """
    A page getter which passes the body to its factory piece by piece
    (C{pageStart}, C{pagePart}, C{pageEnd}).

    @ivar transmittingPage: True while body data is being forwarded to the
        factory.
    """

    transmittingPage = 0

    def handleStatus_200(self, partialContent=0):
        """
        Begin forwarding the body to the factory.

        @param partialContent: True when the response is a 206 reply.
        """
        HTTPPageGetter.handleStatus_200(self)
        self.transmittingPage = 1
        self.factory.pageStart(partialContent)

    def handleStatus_206(self):
        """
        A 206 response is a 200 flagged as partial content.
        """
        self.handleStatus_200(partialContent=1)

    def handleResponsePart(self, data):
        """
        Forward one piece of the body, but only while a page is being
        transmitted.
        """
        if not self.transmittingPage:
            return
        self.factory.pagePart(data)

    def handleResponseEnd(self):
        """
        Finish the transfer: report a truncated download, a completed page
        and/or a failed request to the factory.
        """
        if self.length:
            self.transmittingPage = 0
            truncated = failure.Failure(PartialDownloadError(self.status))
            self.factory.noPage(truncated)
        if self.transmittingPage:
            self.factory.pageEnd()
            self.transmittingPage = 0
        if self.failed:
            requestFailed = failure.Failure(
                error.Error(self.status, self.message, None))
            self.factory.noPage(requestFailed)
            self.transport.loseConnection()
class HTTPClientFactory(protocol.ClientFactory):
    """Download a given URL.

    @type deferred: Deferred
    @ivar deferred: A Deferred that will fire when the content has
          been retrieved. Once this is fired, the ivars `status', `version',
          and `message' will be set.

    @type status: str
    @ivar status: The status of the response.

    @type version: str
    @ivar version: The version of the response.

    @type message: str
    @ivar message: The text message returned with the status.

    @type response_headers: dict
    @ivar response_headers: The headers that were specified in the
          response from the server.

    @type method: str
    @ivar method: The HTTP method to use in the request.  This should be one
        of OPTIONS, GET, HEAD, POST, PUT, DELETE, TRACE, or CONNECT (case
        matters).  Other values may be specified if the server being
        contacted supports them.

    @type redirectLimit: int
    @ivar redirectLimit: The maximum number of HTTP redirects that can occur
          before it is assumed that the redirection is endless.

    @type afterFoundGet: C{bool}
    @ivar afterFoundGet: Deviate from the HTTP 1.1 RFC by handling redirects
        the same way as most web browsers; if the request method is POST and
        a 302 status is encountered, the redirect is followed with a GET
        method

    @type _redirectCount: int
    @ivar _redirectCount: The current number of HTTP redirects encountered.

    @ivar waiting: True until C{deferred} has been fired; guards against
        firing it more than once.
    """

    protocol = HTTPPageGetter

    url = None
    scheme = None
    host = ''
    port = None
    path = None

    def __init__(self, url, method='GET', postdata=None, headers=None,
                 agent="Twisted PageGetter", timeout=0, cookies=None,
                 followRedirect=True, redirectLimit=20,
                 afterFoundGet=False):
        """
        @param url: The URL to request.
        @param method: The HTTP method to use.
        @param postdata: The request body, or C{None} for no body.
        @param headers: A mapping of extra request headers, or C{None}.
        @param agent: The value of the I{User-Agent} header.
        @param timeout: Seconds after which the request is abandoned; a
            false value disables the timeout.
        @param cookies: A C{dict} of cookies to send.  It is stored as-is
            (not copied) and updated from I{Set-Cookie} response headers.
        @param followRedirect: Whether redirects are followed.
        @param redirectLimit: See the C{redirectLimit} ivar.
        @param afterFoundGet: See the C{afterFoundGet} ivar.
        """
        self.followRedirect = followRedirect
        self.redirectLimit = redirectLimit
        self._redirectCount = 0
        self.timeout = timeout
        self.agent = agent
        self.afterFoundGet = afterFoundGet
        if cookies is None:
            cookies = {}
        self.cookies = cookies
        if headers is not None:
            self.headers = InsensitiveDict(headers)
        else:
            self.headers = InsensitiveDict()
        if postdata is not None:
            self.headers.setdefault('Content-Length', len(postdata))
            # just in case a broken http/1.1 decides to keep connection alive
            self.headers.setdefault("connection", "close")
        self.postdata = postdata
        self.method = method

        self.setURL(url)

        self.waiting = 1
        self.deferred = defer.Deferred()
        self.response_headers = None

    def __repr__(self):
        return "<%s: %s>" % (self.__class__.__name__, self.url)

    def setURL(self, url):
        """
        Point this factory at C{url}.

        The scheme, host and port are replaced only when the URL carries
        both a scheme and a host, so a URL lacking them keeps the current
        server.  The path is always replaced.
        """
        self.url = url
        scheme, host, port, path = _parse(url)
        if scheme and host:
            self.scheme = scheme
            self.host = host
            self.port = port
        self.path = path

    def buildProtocol(self, addr):
        """
        Build the protocol, copy the redirect settings onto it and, when a
        timeout is configured, schedule the protocol's C{timeout} method.
        """
        p = protocol.ClientFactory.buildProtocol(self, addr)
        p.followRedirect = self.followRedirect
        p.afterFoundGet = self.afterFoundGet
        if self.timeout:
            timeoutCall = reactor.callLater(self.timeout, p.timeout)
            # Whatever the outcome, the pending timeout must not fire later.
            self.deferred.addBoth(self._cancelTimeout, timeoutCall)
        return p

    def _cancelTimeout(self, result, timeoutCall):
        """
        Cancel C{timeoutCall} if it is still pending and pass C{result}
        through unchanged.
        """
        if timeoutCall.active():
            timeoutCall.cancel()
        return result

    def gotHeaders(self, headers):
        """
        Record the response headers and store every cookie found in
        I{Set-Cookie} headers in C{self.cookies}.

        Only the leading C{name=value} pair of each header value is used;
        anything after the first C{';'} is ignored.  A value whose leading
        segment contains no C{'='} is skipped rather than raising
        C{ValueError}, so one malformed header sent by a server cannot
        break processing of the whole response.

        @param headers: A C{dict} mapping lowercased header names to lists
            of values.
        """
        self.response_headers = headers
        if 'set-cookie' in headers:
            for cookie in headers['set-cookie']:
                nameValue = cookie.split(';')[0]
                if '=' not in nameValue:
                    continue
                k, v = nameValue.split('=', 1)
                self.cookies[k.lstrip()] = v.lstrip()

    def gotStatus(self, version, status, message):
        """
        Record the status line of the response.
        """
        self.version, self.status, self.message = version, status, message

    def page(self, page):
        """
        Fire C{deferred} with the page, unless it has already been fired.
        """
        if self.waiting:
            self.waiting = 0
            self.deferred.callback(page)

    def noPage(self, reason):
        """
        Fail C{deferred} with C{reason}, unless it has already been fired.
        """
        if self.waiting:
            self.waiting = 0
            self.deferred.errback(reason)

    def clientConnectionFailed(self, _, reason):
        """
        Fail C{deferred} with the connection failure, unless it has already
        been fired.
        """
        if self.waiting:
            self.waiting = 0
            self.deferred.errback(reason)
class HTTPDownloader(HTTPClientFactory):
    """
    Download to a file.

    @ivar requestedPartial: The byte offset from which a partial download
        was requested, or 0 when the whole resource is wanted.

    @ivar file: The file object being written to, or C{None} while no file
        is open.

    @ivar value: The result C{deferred} is fired with once the download
        has completed.
    """

    protocol = HTTPPageDownloader
    value = None

    def __init__(self, url, fileOrName,
                 method='GET', postdata=None, headers=None,
                 agent="Twisted client", supportPartial=0,
                 timeout=0, cookies=None, followRedirect=1,
                 redirectLimit=20):
        """
        @param fileOrName: Either a file name (the file is opened when the
            page starts) or an already open file-like object.
        @param supportPartial: When true, C{fileOrName} names an existing,
            non-empty file, a I{Range} header asking for the remainder is
            added to C{headers}.

        The remaining parameters are passed on to L{HTTPClientFactory}.
        """
        self.requestedPartial = 0
        if isinstance(fileOrName, types.StringTypes):
            self.fileName = fileOrName
            self.file = None
            if supportPartial and os.path.exists(self.fileName):
                fileLength = os.path.getsize(self.fileName)
                if fileLength:
                    self.requestedPartial = fileLength
                    if headers is None:
                        headers = {}
                    headers["range"] = "bytes=%d-" % fileLength
        else:
            self.file = fileOrName
        HTTPClientFactory.__init__(
            self, url, method=method, postdata=postdata, headers=headers,
            agent=agent, timeout=timeout, cookies=cookies,
            followRedirect=followRedirect, redirectLimit=redirectLimit)

    def gotHeaders(self, headers):
        """
        Besides the base class handling, check whether the server honoured
        the partial request; if not, fall back to a full download.
        """
        HTTPClientFactory.gotHeaders(self, headers)
        if self.requestedPartial:
            contentRange = headers.get("content-range", None)
            if not contentRange:
                # server doesn't support partial requests, oh well
                self.requestedPartial = 0
                return
            start, end, realLength = http.parseContentRange(contentRange[0])
            if start != self.requestedPartial:
                # server is acting weirdly
                self.requestedPartial = 0

    def openFile(self, partialContent):
        """
        Open C{self.fileName} for writing.

        @param partialContent: When true the file is opened for update and
            positioned at its end; otherwise it is truncated.

        @return: The open file object.
        """
        if partialContent:
            file = open(self.fileName, 'rb+')
            file.seek(0, 2)
        else:
            file = open(self.fileName, 'wb')
        return file

    def pageStart(self, partialContent):
        """Called on page download start.

        @param partialContent: tells us if the download is partial download
            we requested.

        @raise ValueError: If partial content arrives although none was
            requested.
        """
        if partialContent and not self.requestedPartial:
            raise ValueError(
                "we shouldn't get partial content response if we didn't "
                "want it!")
        if self.waiting:
            try:
                if not self.file:
                    self.file = self.openFile(partialContent)
            except IOError:
                # The deferred is fired here, so clear C{waiting}; otherwise
                # a later noPage() would errback it a second time.
                self.waiting = 0
                self.deferred.errback(failure.Failure())

    def pagePart(self, data):
        """
        Write one piece of the body to the file.  Does nothing when no file
        is open.
        """
        if not self.file:
            return
        try:
            self.file.write(data)
        except IOError:
            self.file = None
            # As in pageStart: mark the deferred as fired so noPage() does
            # not errback it again.
            self.waiting = 0
            self.deferred.errback(failure.Failure())

    def noPage(self, reason):
        """
        Close the storage file and errback the waiting L{Deferred} with the
        given reason.
        """
        if self.waiting:
            self.waiting = 0
            if self.file:
                try:
                    self.file.close()
                except Exception:
                    # Best effort: a failing close must not mask C{reason}.
                    log.err(None, "Error closing HTTPDownloader file")
            self.deferred.errback(reason)

    def pageEnd(self):
        """
        Close the file and fire C{deferred} with C{self.value}, or fail it
        if closing the file raises C{IOError}.  Does nothing beyond
        clearing C{waiting} when no file is open.
        """
        self.waiting = 0
        if not self.file:
            return
        try:
            self.file.close()
        except IOError:
            self.deferred.errback(failure.Failure())
            return
        self.deferred.callback(self.value)
def _parse(url, defaultPort=None):
    """
    Break a URL up into its scheme, host, port and path.

    @type url: C{str}
    @param url: The URL to take apart; surrounding whitespace is ignored.

    @type defaultPort: C{int} or C{None}
    @param defaultPort: The port to report when the URL names none (or an
        unparsable one).  C{None} selects 443 for I{https} and 80 for
        everything else.

    @return: A four-tuple C{(scheme, host, port, path)}.  All elements are
        C{str} instances except the port, which is an C{int}.  An empty
        path is reported as C{'/'}.
    """
    pieces = http.urlparse(url.strip())
    scheme, netloc = pieces[0], pieces[1]
    # Everything after the network location, reassembled into one string.
    path = urlunparse(('', '') + pieces[2:])
    if not path:
        path = '/'

    if defaultPort is None:
        defaultPort = {'https': 443}.get(scheme, 80)

    host = netloc
    port = defaultPort
    if ':' in netloc:
        host, portText = netloc.split(':')
        try:
            port = int(portText)
        except ValueError:
            port = defaultPort

    return scheme, host, port, path
def _makeGetterFactory(url, factoryFactory, contextFactory=None,
                       *args, **kwargs):
    """
    Build a page getting factory for C{url} and start connecting it.

    All extra positional and keyword arguments are handed to
    C{factoryFactory}.

    @param factoryFactory: Callable invoked with C{url}, C{args} and
        C{kwargs}; its result is the factory that gets connected.

    @param contextFactory: The context factory for an I{https} connection.
        When C{None}, a fresh C{ssl.ClientContextFactory} is used.

    @return: The factory produced by C{factoryFactory}.
    """
    parts = _parse(url)
    scheme, host, port = parts[0], parts[1], parts[2]
    factory = factoryFactory(url, *args, **kwargs)

    if scheme != 'https':
        reactor.connectTCP(host, port, factory)
        return factory

    # Imported only when needed.
    from twisted.internet import ssl
    if contextFactory is None:
        contextFactory = ssl.ClientContextFactory()
    reactor.connectSSL(host, port, factory, contextFactory)
    return factory
def getPage(url, contextFactory=None, *args, **kwargs):
    """
    Download a web page as a string.

    Download a page. Return a deferred, which will callback with a
    page (as a string) or errback with a description of the error.

    See HTTPClientFactory to see what extra args can be passed.

    @param url: The URL of the page to download.

    @param contextFactory: Context factory to use for an I{https} URL, or
        C{None} for the default.

    @return: The C{deferred} of the L{HTTPClientFactory} created for the
        request.
    """
    # contextFactory is passed positionally on purpose: given as a keyword
    # ahead of *args, the first extra positional argument would land in the
    # same parameter and the call would fail with TypeError.
    return _makeGetterFactory(
        url,
        HTTPClientFactory,
        contextFactory,
        *args, **kwargs).deferred
def downloadPage(url, file, contextFactory=None, *args, **kwargs):
    """
    Download a web page to a file.

    @param url: The URL of the page to download.

    @param file: path to file on filesystem, or file-like object.

    @param contextFactory: Context factory to use for an I{https} URL, or
        C{None} for the default.

    See HTTPDownloader to see what extra args can be passed.

    @return: The C{deferred} of the L{HTTPDownloader} created for the
        request.
    """
    def factoryFactory(url, *a, **kw):
        # Slot the destination in right after the URL, as HTTPDownloader
        # expects.
        return HTTPDownloader(url, file, *a, **kw)

    # contextFactory is passed positionally on purpose: given as a keyword
    # ahead of *args, the first extra positional argument would land in the
    # same parameter and the call would fail with TypeError.
    return _makeGetterFactory(
        url,
        factoryFactory,
        contextFactory,
        *args, **kwargs).deferred
# The code which follows is based on the new HTTP client implementation. It # should be significantly better than anything above, though it is not yet # feature equivalent.
from twisted.internet.protocol import ClientCreator from twisted.web.error import SchemeNotSupported from twisted.web._newclient import ResponseDone, Request, HTTP11ClientProtocol from twisted.web._newclient import Response
try:
    from twisted.internet.ssl import ClientContextFactory
except ImportError:
    class WebClientContextFactory(object):
        """
        Stand-in web context factory for installations where the SSL
        support cannot be imported; it never produces a context.
        """
        def getContext(self, hostname, port):
            """
            Always fail, because there is no SSL support to build on.
            """
            raise NotImplementedError("SSL support unavailable")
else:
    class WebClientContextFactory(ClientContextFactory):
        """
        Web context factory that disregards the hostname and port it is
        given and performs no certificate verification.
        """
        def getContext(self, hostname, port):
            """
            Return the context of the plain client context factory; both
            arguments are ignored.
            """
            return ClientContextFactory.getContext(self)
class _WebToNormalContextFactory(object): """ Adapt a web context factory to a normal context factory.
@ivar _webContext: A web context factory which accepts a hostname and port number to its C{getContext} method.
@ivar _hostname: The hostname which will be passed to C{_webContext.getContext}.
@ivar _port: The port number which will be passed to C{_webContext.getContext}. """ def __init__(self, webContext, hostname, port): self._webContext = webContext self._hostname = hostname self._port = port
def getContext(self): """ Called the wrapped web context factory's C{getContext} method with a hostname and port number and return the resulting context object. """ return self._webContext.getContext(self._hostname, self._port)
class Agent(object):
    """
    L{Agent} is a very basic HTTP client.  It handles I{HTTP} and I{HTTPS}
    scheme URIs (without any certificate checking by default) and does not
    support persistent connections.

    @ivar _reactor: The L{IReactorTCP} and L{IReactorSSL} implementation
        used to set up the connections over which requests are issued.

    @ivar _contextFactory: The web context factory used to create SSL
        context objects for the SSL connections the agent makes.

    @since: 9.0
    """
    _protocol = HTTP11ClientProtocol

    def __init__(self, reactor, contextFactory=WebClientContextFactory()):
        self._contextFactory = contextFactory
        self._reactor = reactor

    def _wrapContextFactory(self, host, port):
        """
        Wrap C{self._contextFactory} in a normal context factory which
        passes the given host and port on to it.

        @param host: A C{str} giving the hostname which will be connected
            to in order to issue a request.

        @param port: An C{int} giving the port number the connection will
            be on.

        @return: A context factory suitable to be passed to
            C{reactor.connectSSL}.
        """
        webContext = self._contextFactory
        return _WebToNormalContextFactory(webContext, host, port)

    def _connect(self, scheme, host, port):
        """
        Open a connection to the given host and port over the transport
        the scheme calls for.

        @param scheme: C{'http'} or C{'https'}; any other value yields a
            failed L{Deferred}.

        @param host: A C{str} giving the hostname which will be connected
            to in order to issue a request.

        @param port: An C{int} giving the port number the connection will
            be on.

        @return: A L{Deferred} which fires with a connected instance of
            C{self._protocol}, or fails with L{SchemeNotSupported}.
        """
        creator = ClientCreator(self._reactor, self._protocol)
        if scheme == 'http':
            return creator.connectTCP(host, port)
        if scheme == 'https':
            sslContext = self._wrapContextFactory(host, port)
            return creator.connectSSL(host, port, sslContext)
        unsupported = SchemeNotSupported(
            "Unsupported scheme: %r" % (scheme,))
        return defer.fail(unsupported)

    def request(self, method, uri, headers=None, bodyProducer=None):
        """
        Issue a new request.

        @param method: The request method to send.
        @type method: C{str}

        @param uri: The request URI to send.
        @type uri: C{str}

        @param headers: The request headers to send.  If no I{Host} header
            is included, one will be added based on the request URI.
        @type headers: L{Headers}

        @param bodyProducer: An object which will produce the request body
            or, if the request body is to be empty, L{None}.
        @type bodyProducer: L{IBodyProducer} provider

        @return: A L{Deferred} which fires with the result of the request
            (a L{Response} instance), or fails if there is a problem
            setting up a connection over which to issue the request.  It
            may also fail with L{SchemeNotSupported} if the scheme of the
            given URI is not supported.
        @rtype: L{Deferred}
        """
        scheme, host, port, path = _parse(uri)
        connecting = self._connect(scheme, host, port)

        if headers is None:
            headers = Headers()
        if not headers.hasHeader('host'):
            # Work on a copy so the caller's Headers object is not touched.
            # This is a lot of copying; less would be nicer.
            rawCopy = dict(headers.getAllRawHeaders())
            headers = Headers(rawCopy)
            hostValue = self._computeHostValue(scheme, host, port)
            headers.addRawHeader('host', hostValue)

        def issueRequest(proto):
            # Runs once the connection is up.
            outgoing = Request(method, path, headers, bodyProducer)
            return proto.request(outgoing)

        connecting.addCallback(issueRequest)
        return connecting

    def _computeHostValue(self, scheme, host, port):
        """
        Work out the value of the I{Host} header for the given scheme, host
        name and port number: the bare host name when the port is the
        scheme's usual one, C{host:port} otherwise.
        """
        usualPorts = (('http', 80), ('https', 443))
        if (scheme, port) not in usualPorts:
            return '%s:%d' % (host, port)
        return host
# Names exported by this module.
__all__ = [
    # Page getter based client.
    'PartialDownloadError',
    'HTTPPageGetter',
    'HTTPPageDownloader',
    'HTTPClientFactory',
    'HTTPDownloader',
    'getPage',
    'downloadPage',

    # Client based on the new HTTP client implementation.
    'ResponseDone',
    'Response',
    'Agent',
]
|