Source code for fonduer.parser.preprocessors.html_doc_preprocessor

"""Fonduer HTML document preprocessor."""
import codecs
import os
from typing import Iterator

from bs4 import BeautifulSoup

from fonduer.parser.models import Document
from fonduer.parser.preprocessors.doc_preprocessor import DocPreprocessor


[docs]class HTMLDocPreprocessor(DocPreprocessor): """A ``Document`` generator for HTML files.""" def _parse_file(self, fp: str, file_name: str) -> Iterator[Document]: with codecs.open(fp, encoding=self.encoding) as f: soup = BeautifulSoup(f, "lxml") all_html_elements = soup.find_all("html") if len(all_html_elements) != 1: raise NotImplementedError( f"Expecting exactly one html element per html file: {file_name}" ) text = all_html_elements[0] name = os.path.basename(fp)[: os.path.basename(fp).rfind(".")] stable_id = self._get_stable_id(name) yield Document( name=name, stable_id=stable_id, text=str(text), meta={"file_name": file_name}, ) def __len__(self) -> int: """Provide a len attribute based on max_docs and number of files in folder.""" num_docs = min(len(self.all_files), self.max_docs) return num_docs def _can_read(self, fpath: str) -> bool: return fpath.lower().endswith("html") # includes both .html and .xhtml