More code doodlin' in Python. A web crawler this time.
1: import sys
2: import httplib
3: import urlparse
4: from BeautifulSoup import BeautifulSoup
5:
class Crawler:
    """Depth-limited recursive crawler for the pages of a single host.

    Starting at the root URL, every page is fetched once, handed to the
    handler and its links are then followed recursively.
    """

    def __init__(self, host, root, depth, handler):
        """Set up a crawl of ``host``.

        host    -- host name; used for the HTTP connection and to recognise
                   absolute links that point back at the same site
        root    -- URL (path) of the first page to fetch
        depth   -- maximum link depth to follow (the root is depth 0);
                   zero or less means no limit
        handler -- callable invoked with each fetched Page
        """
        self._host = host
        self._root = root
        self._depth = depth
        self._handler = handler
        # a set gives constant-time "have we been here?" checks
        self._visited = set()
        self._connection = httplib.HTTPConnection(host)

    def run(self):
        """Crawl the site, starting from the root URL."""
        self._run(self._root, '', 0)

    def _run(self, url, parentUrl, currentDepth):
        """Fetch ``url``, pass the page to the handler and follow its links.

        url          -- link as found on the parent page (may be relative)
        parentUrl    -- URL of the page the link was found on ('' for the root)
        currentDepth -- number of links followed from the root to get here

        Returns None. Links that are empty, too deep, not plain paths on
        this host, or already visited are silently skipped.
        """
        # nothing to follow (None or an empty href); indexing it would blow up
        if not url:
            return

        # is some clown using absolute URLs for internal links?
        absoluteRoot = 'http://' + self._host
        if url == absoluteRoot:
            # a bare link to the host itself means the site root
            url = '/'
        else:
            url = url.replace(absoluteRoot, '')

        # bail if we're running too deep
        if self._depth > 0 and currentDepth > self._depth:
            return
        # bail if it's a manky URL (another scheme/host, or just a fragment)
        if not url or ':' in url or url.startswith('#'):
            return

        # normalise relative urls against the directory of the parent page
        if not url.startswith('/'):
            index = parentUrl.rfind('/')
            if index > -1:
                url = parentUrl[:index] + '/' + url
            else:
                url = '/' + url

        # bail if we've already visited this page
        if url in self._visited:
            return

        page = Page(self._connection, url)
        self._handler(page)
        self._visited.add(url)

        # an explicit loop: the recursion is wanted for its side effects only
        for childUrl in page.urls:
            self._run(childUrl, url, currentDepth + 1)
45:
class Page:
    """A single fetched URL: its status, outgoing links and form fields.

    Attributes set by the constructor:
    url                -- the URL that was requested
    statusCode         -- HTTP status code of the response
    urls               -- redirect target (for 3xx responses) followed by
                          every href found in an HTML body
    inputs             -- input, select and textarea elements of an HTML body
    querystring_params -- parameter names taken from the URL's query string
    """

    def __init__(self, connection, url):
        """GET ``url`` over ``connection`` and collect links and form fields.

        connection -- object offering connect/request/getresponse/close,
                      e.g. an httplib.HTTPConnection; it is closed again
                      before returning, even when the request fails
        url        -- path (plus optional query string) to request
        """
        self.url = url
        self.urls = []
        self.inputs = []

        # get a list of querystring keys; a URL without a query has none
        querystring = urlparse.urlparse(url).query
        self.querystring_params = [part.split('=')[0]
                                   for part in querystring.split('&') if part]

        try:
            connection.connect()
            connection.request('GET', url, headers={'User-Agent': 'Colourblind Crawler 0.1'})
            response = connection.getresponse()

            self.statusCode = response.status
            # handle redirects (location probably isn't relevant to all of them)
            if 300 <= self.statusCode < 400:
                location = response.getheader('Location')
                if location is not None:
                    self.urls.append(location)

            # if it's HTML, parse the sucker; the header can be missing entirely
            # NOTE(review): parsing is done whatever the status code -- confirm
            # that matches the nesting intended by the original listing
            contentType = response.getheader('Content-Type') or ''
            if 'text/html' in contentType:
                soup = BeautifulSoup(response.read(), fromEncoding='utf-8')
                # grab all the hrefs and drop anchors that have none
                hrefs = [link.get('href') for link in soup('a')]
                self.urls.extend([href for href in hrefs if href is not None])
                for tagName in ('input', 'select', 'textarea'):
                    self.inputs.extend(soup(tagName))
        finally:
            # always release the socket, even if the request or parse failed
            connection.close()
77:
def print_page(page):
    """Print a one-line summary of ``page`` followed by its form field names.

    page -- object with ``url``, ``statusCode`` and ``inputs`` attributes;
            each entry of ``inputs`` must offer ``get('name')``

    The URL is padded with dots to 75 columns so the status codes line up.
    Fields without a name attribute are printed as ``None``.
    """
    print('{0} {1}'.format(page.url.ljust(75, '.'), page.statusCode))
    # "field" rather than "input", which would shadow the builtin
    for field in page.inputs:
        name = field.get('name')
        print('\t{0}'.format(name))
83:
def main(argv):
    """Run the crawler from the command line: host [startPage [depth]].

    argv -- full argument list, program name at index 0

    Returns 0 after a completed crawl, or 2 when the arguments are unusable
    (no host given, or a depth that is not an integer).
    """
    if len(argv) < 2:
        program = argv[0] if argv else 'crawler'
        sys.stderr.write('usage: {0} host [startPage [depth]]\n'.format(program))
        return 2

    startPage = '/'
    depth = 3
    if len(argv) > 2:
        startPage = argv[2]
    if len(argv) > 3:
        try:
            depth = int(argv[3])
        except ValueError:
            sys.stderr.write('depth must be an integer, got {0!r}\n'.format(argv[3]))
            return 2

    crawler = Crawler(argv[1], startPage, depth, print_page)
    crawler.run()
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
It's far from perfect (I still don't know how best to handle case sensitivity in the URLs), but I wrote this as part of a larger project which will, realistically, never get more than 10% completed. It'd be a shame for the code to never see the light of day, so here it is.