"""Asynchronous webscraping. """ import asyncio import aiohttp from get_synchronous import LanguageInfluences from parse_html import parse class AsyncLanguageInfluences(LanguageInfluences): """Asynchronous version. """ @asyncio.coroutine def get_url_content(self, url, name): """Get and parse the content of the URL. """ if not url.startswith('/wiki/'): print('Skipping URL:', url) return None full_url = self.url_base + url print('fetching', full_url) http_response = yield from aiohttp.request('GET', full_url) html = (yield from http_response.read_and_close()) return (yield from self.parse(html, url, name)) @asyncio.coroutine def parse(self, html, url, name): """Parse and store in DB. """ if html: content = parse(html) data = {'name': name, 'content': content} self.db[url] = data return content @asyncio.coroutine def work(self, content=None): """Process urls recursively. """ if not content and self.start_url not in self.db: content = yield from self.get_url_content(self.start_url, self.start_name) if content and len(self.db) <= self.limit: for category in self.categories: for url, names in content[category].items(): if not url or url == 'None': continue name = next(iter(names)) if url not in self.db: content = yield from self.get_url_content(url, name) yield from self.work(content) if __name__ == '__main__': def test(): """Run and measure the runtime. """ import timeit start = timeit.default_timer() langs = AsyncLanguageInfluences() loop = asyncio.get_event_loop() loop.run_until_complete(langs.work()) duration = timeit.default_timer() - start print(duration) test()