"""Parsing a Wikipedia page about a programming language. We extract the relationship 'Influenced by' and 'Influenced' to other languages. """ from collections import defaultdict import bs4 from bs4 import BeautifulSoup def _find_info_table(html): """Find the table with info box on the right hand side. This box contains the information about languages that influenced the target language as well as about languages that got influenced by it. """ return html.find('table', attrs={'class': 'infobox vevent'}) def _find_lang_row(info_table, target_header): """Find the row in the table that contains `target_header`. Where `target_header` is either 'Influenced by' or 'Influenced'. """ res = None for row in info_table: header = getattr(row, 'th', None) if header and header.contents[0] == target_header: res = row break return res def _make_lang_map(lang_row): """Map the url to the language. We use a set to hold the language name just in case there are several spellings for a language, i.e. an entry with the same url. """ res = defaultdict(set) tags = (entry for entry in lang_row.find('td') if isinstance(entry, bs4.element.Tag)) for tag in tags: href = tag.get('href') name = str(tag.contents[0]) if href else None if name: res[href].add(name) return res def parse(html_text): """Parse the given HTML. """ html = BeautifulSoup(html_text) info_table = _find_info_table(html) res = {} if not info_table: return res for target_header in ['Influenced by', 'Influenced']: lang_row = _find_lang_row(info_table, target_header) if lang_row: mapping = _make_lang_map(lang_row) else: mapping = {} res[target_header] = mapping return res