"""
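 Synchronous web scraping of programming-language influences.

 Pages are fetched one at a time over HTTP and the parsed results are stored
 in a ``shelve`` database.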
 """

 import shelve
 from urllib.request import urlopen

 from parse_html import parse


 class LanguageInfluences:
     """Find programming languages and their influences.

     Starting from one language page, follow the links found under the
     ``Influenced by`` and ``Influenced`` categories of every parsed page
     and store each parsed page in a ``shelve`` database keyed by its URL
     path.

     The instance owns the open shelf: call :meth:`close` when finished, or
     use the instance as a context manager.
     """

     def __init__(self, start_name='Python',
                  start_url='/wiki/Python_(programming_language)',
                  limit=100,
                  url_base='http://en.wikipedia.org',
                  db_name='languages'):
         """Open the database and remember the crawl settings.

         Args:
             start_name: Name stored for the start page.
             start_url: URL path (relative to `url_base`) of the start page.
             limit: Maximum number of entries kept in the database; no
                 further page is fetched once it holds this many.
             url_base: Scheme and host prepended to every URL path.
             db_name: File name handed to ``shelve.open``.
         """
         self.start_name = start_name
         self.start_url = start_url
         self.limit = limit
         self.db = shelve.open(db_name)
         self.url_base = url_base
         self.categories = ['Influenced by', 'Influenced']

     def __enter__(self):
         """Return the crawler itself so it can be used in a ``with`` block."""
         return self

     def __exit__(self, exc_type, exc_value, traceback):
         """Close the database on leaving the ``with`` block.

         Exceptions raised inside the block are not suppressed.
         """
         self.close()

     def close(self):
         """Close the shelf so that everything stored is written out."""
         self.db.close()

     def get_url_content(self, url, name):
         """Get and parse the content of the URL.

         Args:
             url: URL path; only paths starting with ``/wiki/`` are fetched.
             name: Name stored alongside the parsed content.

         Returns:
             The parsed content, or ``None`` when the URL was skipped or the
             response body was empty.

         Errors raised by ``urlopen`` are not handled here and propagate to
         the caller.
         """
         if not url.startswith('/wiki/'):
             print('Skipping URL:', url)
             return None
         full_url = self.url_base + url
         print('fetching', full_url)
         with urlopen(full_url) as http_response:
             html = http_response.read()
         return self.parse(html, url, name)

     def parse(self, html, url, name):
         """Parse and store in DB.

         The page is parsed with the module-level ``parse`` function and
         saved under `url` as ``{'name': name, 'content': content}``.

         Returns:
             The parsed content, or ``None`` when `html` is empty (nothing
             is stored in that case).
         """
         if not html:
             return None
         content = parse(html)
         self.db[url] = {'name': name, 'content': content}
         return content

     def _limit_reached(self):
         """Tell whether the database already holds `limit` entries."""
         return len(self.db) >= self.limit

     def work(self, content=None):
         """Process urls recursively (depth first).

         Args:
             content: Parsed page whose links should be followed. It is
                 used as a mapping from category name to a mapping of
                 URL path -> iterable of names. When empty or omitted, the
                 start page is fetched first, unless it is already stored,
                 in which case nothing is done.

         The limit is checked before every fetch. Checking it only after a
         page has been fetched and stored would let every pending level of
         the recursion keep downloading its remaining links.
         """
         if not content and self.start_url not in self.db:
             if self._limit_reached():
                 return
             content = self.get_url_content(self.start_url, self.start_name)
         if not content:
             return
         for category in self.categories:
             if category not in content:
                 continue
             for url, names in content[category].items():
                 if self._limit_reached():
                     return
                 # The literal string 'None' is skipped as well as empty
                 # URLs.
                 if not url or url == 'None':
                     continue
                 if url in self.db:
                     continue
                 name = next(iter(names))
                 self.work(self.get_url_content(url, name))

 if __name__ == '__main__':

     def test():
         """Run a crawl with the default settings and print its runtime.

         The printed duration is in seconds and covers opening the
         database, the crawl itself and closing the database.
         """
         import timeit
         start = timeit.default_timer()
         langs = LanguageInfluences()
         try:
             langs.work()
         finally:
             # Close the shelf explicitly so stored pages are written out
             # even when the crawl aborts with an error.
             langs.db.close()
         duration = timeit.default_timer() - start
         print(duration)

     test()