A tool for scraping definitions of Russian words from Wiktionary

In my perpetual attempt to make my Anki-based language learning more efficient, I’ve written a tool to extract English-language definitions of Russian words from Wiktionary. I wrote about the idea previously in Scraping Russian word definitions from Wiktionary: utility for Anki, but that version relied on the WiktionaryParser module, which works well but misses some important edge cases. So I rolled up my sleeves and crafted my own solution. As with WiktionaryParser, the heavy lifting is done by the Beautiful Soup parser. Much of the logic in this tool is around detecting the edge cases I mentioned. For example, the underlying HTML format changes when a word has multiple etymologies rather than a single one. Whenever you’re doing web scraping, you have to account for those sorts of variations.
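
To make the structural difference concrete, here is a minimal sketch of the two page shapes the script has to handle. The markup is simplified for illustration and is not Wiktionary’s exact HTML: with a single etymology, the part-of-speech heading is an h3; with multiple etymologies, each part of speech drops down to an h4 under an “Etymology N” h3.

from bs4 import BeautifulSoup

# simplified single-etymology page: part of speech is an <h3>
single = BeautifulSoup(
   '<h2><span id="Russian">Russian</span></h2>'
   '<h3><span class="mw-headline" id="Noun">Noun</span></h3>'
   '<ol><li>dog</li></ol>', 'html.parser')

# simplified multiple-etymology page: parts of speech shift to <h4>
multiple = BeautifulSoup(
   '<h2><span id="Russian">Russian</span></h2>'
   '<h3><span class="mw-headline" id="Etymology_1">Etymology 1</span></h3>'
   '<h4><span class="mw-headline" id="Noun">Noun</span></h4>'
   '<ol><li>bay</li></ol>', 'html.parser')

# the script therefore searches both heading levels
print([h.text for h in single.find_all(['h3', 'h4'])])    # ['Noun']
print([h.text for h in multiple.find_all(['h3', 'h4'])])  # ['Etymology 1', 'Noun']

This is why the main loop below iterates over both h3 and h4 tags, and why it excludes headings such as “Etymology” and “Pronunciation” that never contain definitions.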

Code

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from urllib.request import urlopen, Request
import urllib.error
import urllib.parse
from http.client import HTTPResponse
from random_user_agent.user_agent import UserAgent
from random_user_agent.params import HardwareType
import copy
import re
import sys
from bs4 import BeautifulSoup, element

def remove_html_comments(html: str) -> str:
   """
   Strips HTML comments. See https://stackoverflow.com/a/57996414
   :param html: html string to process
   :return: html string with comments stripped
   """
   result = re.sub(r'(<!--.*?-->)|(<!--[\S\s]+?-->)|(<!--[\S\s]*?$)', "", html)
   return result

def extract_russian_soup(response: HTTPResponse) -> BeautifulSoup:
   """
   Builds a new soup containing only the Russian section of the page.
   :param response: HTTP response for the Wiktionary page
   :return: soup with the Russian h2 heading and its following siblings
   """
   new_soup = BeautifulSoup('', 'html.parser')
   # remove HTML comments before processing
   html_str = response.read().decode('UTF-8')
   cleaner_html = remove_html_comments(html_str)
   soup = BeautifulSoup(cleaner_html, 'html.parser')
   # get rid of certain tags to make the soup lighter to work with
   for s in soup(['head', 'script', 'footer']):
      s.extract()
   for h2 in soup.find_all('h2'):
      for span in h2.children:
         # children may be NavigableStrings (TypeError) or tags
         # lacking an id (KeyError); skip both
         try:
            if 'Russian' in span['id']:
               new_soup.append(copy.copy(h2))
               # capture everything in the Russian section
               for curr_sibling in h2.next_siblings:
                  if curr_sibling.name == "h2":
                     break
                  new_soup.append(copy.copy(curr_sibling))
               break
         except (KeyError, TypeError):
            pass
   return new_soup
   
def check_excluded_ids(span_id: str) -> bool:
   excluded = ['Pronunciation', 'Alternative_forms', 'Etymology']
   for ex in excluded:
      if re.search(ex, span_id, re.IGNORECASE):
         return True
   return False

def remove_dl_ul(li: element.Tag) -> element.Tag:
   # quotations and usage examples are typically nested in a <dl>; remove it
   try:
      li.dl.extract()
   except AttributeError:
      pass
   # sometimes citations are presented in a <ul>, so remove that too
   try:
      li.ul.extract()
   except AttributeError:
      pass
   return li

def url_from_ru_word(raw_word: str) -> str:
   # strip the combining acute/grave accents used to mark stress
   raw_word = re.sub(r'\u0301|\u0300', "", raw_word)
   # strip surrounding whitespace first, then make internal spaces URL-safe
   raw_word = raw_word.strip().replace(" ", "_")
   word = urllib.parse.quote(raw_word)
   return f'https://en.wiktionary.org/wiki/{word}#Russian'

def request_headers() -> dict:
   # rotate among plausible desktop user agents so requests are
   # less likely to be rejected as automated
   hn = [HardwareType.COMPUTER.value]
   user_agent_rotator = UserAgent(hardware_types=hn, limit=20)
   user_agent = user_agent_rotator.get_random_user_agent()
   return {'user-agent': user_agent}

if __name__ == "__main__":
   __version__ = 1.0
   
   # accept word as either argument or on stdin
   try:
      raw_word = sys.argv[1]
   except IndexError:
      raw_word = sys.stdin.read()
      
   url = url_from_ru_word(raw_word)
   headers = request_headers()
   
   try:
      response = urlopen(Request(url, headers=headers))
   except urllib.error.HTTPError as e:
      if e.code == 404:
         print("Error - no such word")
      else:
         print(f"Error: status {e.code}")
      sys.exit(1)
   
   # first extract the Russian content because
   # we may have other languages. This just
   # simplifies the parsing for the headword
   new_soup = extract_russian_soup(response)
            
   # use the derived soup to pick out the headword from
   # the Russian-specific content
   definitions = []
   
   # there are cases (as with the word 'бухта') where there are
   # multiple etymologies. In those cases the page structure is
   # different, so we try both structures.
   
   for tag in ['h3', 'h4']:
      for h3_or_h4 in new_soup.find_all(tag):
         found = False
         for h3_or_h4_child in h3_or_h4.children:
            if h3_or_h4_child.name == 'span':
               if h3_or_h4_child.get('class'):
                  span_classes = h3_or_h4_child.get('class')
                  if 'mw-headline' in span_classes:
                     span_id = h3_or_h4_child.get('id')
                     # exclude any heading whose span is not a part of speech
                     if span_id and not check_excluded_ids(span_id):
                        found = True
                     break
         if found:
            ol = h3_or_h4.find_next_sibling('ol')
            if ol is None:
               continue
            lis = ol.children
            for li in lis:
               # skip '\n' children
               if li.name != 'li':
                  continue
               # remove nested citation/example sublists (<dl>/<ul>)
               li = remove_dl_ul(li)
               li_def = li.text.strip()
               definitions.append(li_def)
   definition_list = '; '.join(definitions)
   # remove any leading '; ' separators left behind by empty definitions
   definition_list = re.sub(r'^(?:;\s)+(.*)$', '\\1', definition_list)
   # remove "see also" links
   definition_list = re.sub(r'\(see also[^\)]*\)+', "", definition_list)
   print(definition_list)
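
The script has two third-party dependencies: Beautiful Soup and the random_user_agent package. Assuming the standard PyPI package names, they can be installed with:

pip install beautifulsoup4 random-user-agent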

Usage

The script is flexible, accepting a Russian word either on stdin or as its first argument. For example:

echo "собака" | ruendef # or
ruendef "собака"

Both print out:

dog; hound; (derogatory, figuratively) mongrel, cur, bastard (a detestable person); (colloquial, figuratively) fox (a clever, capable person); (Internet) @ (at sign); (computing slang) watchdog timer
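
Because url_from_ru_word strips the combining acute and grave accents used to mark stress, pasting a word with its stress mark produces the same output:

ruendef "соба́ка"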