Source code for fonduer.parser.preprocessors.text_doc_preprocessor

"""Fonduer text document preprocessor."""
import codecs
import os
from typing import Iterator

from fonduer.parser.models import Document
from fonduer.parser.preprocessors.doc_preprocessor import DocPreprocessor
from fonduer.utils.utils_parser import build_node


[docs]class TextDocPreprocessor(DocPreprocessor): """A ``Document`` generator for plain text files.""" def _parse_file(self, fp: str, file_name: str) -> Iterator[Document]: with codecs.open(fp, encoding=self.encoding) as f: name = os.path.basename(fp).rsplit(".", 1)[0] stable_id = self._get_stable_id(name) text = build_node("doc", None, build_node("text", None, f.read().strip())) yield Document( name=name, stable_id=stable_id, text=text, meta={"file_name": file_name} ) def __len__(self) -> int: """Provide a len attribute based on max_docs and number of files in folder.""" num_docs = min(len(self.all_files), self.max_docs) return num_docs