""" Synchronous webscraping. """ import shelve from urllib.request import urlopen from parse_html import parse class LanguageInfluences(object): """Find programming languages and the their influnces. """ def __init__(self, start_name='Python', start_url='/wiki/Python_(programming_language)', limit=100, url_base='http://en.wikipedia.org', db_name='languages'): self.start_name = start_name self.start_url = start_url self.limit = limit self.db = shelve.open(db_name) self.url_base = url_base self.categories = ['Influenced by', 'Influenced'] def get_url_content(self, url, name): """Get and parse the content of the URL. """ if not url.startswith('/wiki/'): print('Skipping URL:', url) return None full_url = self.url_base + url print('fetching', full_url) with urlopen(full_url) as http_response: html = http_response.read() return self.parse(html, url, name) def parse(self, html, url, name): """Parse and store in DB. """ if html: content = parse(html) data = {'name': name, 'content': content} self.db[url] = data return content def work(self, content=None): """Process urls recursively. """ if not content and self.start_url not in self.db: content = self.get_url_content(self.start_url, self.start_name) if content and len(self.db) <= self.limit: for category in self.categories: if category not in content: continue for url, names in content[category].items(): if not url or url == 'None': continue name = next(iter(names)) if url not in self.db: self.work(self.get_url_content(url, name)) if __name__ == '__main__': def test(): """Run and measure the runtime. """ import timeit start = timeit.default_timer() langs = LanguageInfluences() langs.work() duration = timeit.default_timer() - start print(duration) test()