Source code for fonduer.parser.visual_parser.hocr_visual_parser

"""Fonduer visual parser that parses visual information from hOCR."""
import itertools
import re
from typing import Dict, Iterable, Iterator, List, Pattern, Tuple, Union

import spacy
import spacy.gold
from packaging import version
from spacy.gold import align

from fonduer.parser.models import Sentence
from fonduer.parser.visual_parser.visual_parser import VisualParser


[docs]class HocrVisualParser(VisualParser): """Visual Parser for hOCR.""" def __init__( self, replacements: List[Tuple[str, str]] = [ ("[\u2010\u2011\u2012\u2013\u2014\u2212]", "-") ], ): """Initialize a visual parser. :raises ImportError: an error is raised when spaCy is not 2.3.0 or later. """ if version.parse(spacy.__version__) < version.parse("2.3.0"): raise ImportError( f"You are using spaCy {spacy.__version__}, " f"but it should be 2.3.0 or later to use HocrVisualParser." ) self.replacements: List[Tuple[Pattern, str]] = [] for (pattern, replace) in replacements: self.replacements.append((re.compile(pattern, flags=re.UNICODE), replace))
[docs] def parse( self, document_name: str, sentences: Iterable[Sentence] ) -> Iterator[Sentence]: """Parse visual information embedded in sentence's html_attrs. :param document_name: the document name. :param sentences: sentences to be linked with visual information. :return: A generator of ``Sentence``. """ def attrib_parse( html_attrs: List[str], ) -> Dict[str, Union[List[int], List[str]]]: ret: Dict[str, Union[List[int], List[str]]] = {} for attr in html_attrs: key, values = attr.split("=", 1) # split only at the first occurence if key in ["left", "top", "right", "bottom", "ppageno"]: ret[key] = [int(x) for x in values.split()] elif key == "tokens": # Run RegEx replacements for (rgx, replace) in self.replacements: values = rgx.sub(replace, values) ret[key] = values.split() return ret for _, group in itertools.groupby(sentences, key=lambda x: x.xpath): sents = list(group) # Get bbox from document attribs = attrib_parse(sents[0].html_attrs) lefts = attribs["left"] tops = attribs["top"] rights = attribs["right"] bottoms = attribs["bottom"] ppagenos = attribs["ppageno"] # Clear the hocr specific html_attrs for sent in sents: for attr in sent.html_attrs[:]: key, values = attr.split( "=", 1 ) # split only at the first occurence if key in [ "left", "top", "right", "bottom", "ppageno", "tokens", "x_wconf", ]: sent.html_attrs.remove(attr) # Get a list of all tokens represented by ocrx_word in hOCR hocr_tokens = attribs["tokens"] # Get a list of all tokens tokenized by spaCy. spacy_tokens = [word for sent in sents for word in sent.words] # gold.align assumes that both tokenizations add up to the same string. cost, h2s, s2h, h2s_multi, s2h_multi = align(hocr_tokens, spacy_tokens) ptr = 0 # word pointer for sent in sents: sent.left = [] sent.top = [] sent.right = [] sent.bottom = [] sent.page = [] for i, word in enumerate(sent.words): # One-to-one mapping is NOT available if s2h[ptr + i] == -1: if ptr + i in s2h_multi: # One spacy token-to-multi hOCR words left = lefts[s2h_multi[ptr + i]] top = tops[s2h_multi[ptr + i]] right = rights[s2h_multi[ptr + i]] bottom = bottoms[s2h_multi[ptr + i]] ppageno = ppagenos[s2h_multi[ptr + i]] else: h2s_multi_idx = [ k for k, v in h2s_multi.items() if ptr + i == v ] start, end = 0, 0 if h2s_multi_idx: # One hOCR word-to-multi spacy tokens start = h2s_multi_idx[0] end = h2s_multi_idx[-1] + 1 else: start = s2h_multi[i - 1 if i > 0 else 0] end = s2h_multi[i + 1] + 1 # calculate a bbox that can include all left = min(lefts[start:end]) top = min(tops[start:end]) right = max(rights[start:end]) bottom = max(bottoms[start:end]) ppageno = ppagenos[start] # One-to-one mapping is available else: left = lefts[s2h[ptr + i]] top = tops[s2h[ptr + i]] right = rights[s2h[ptr + i]] bottom = bottoms[s2h[ptr + i]] ppageno = ppagenos[s2h[ptr + i]] sent.left.append(left) sent.top.append(top) sent.right.append(right) sent.bottom.append(bottom) sent.page.append(ppageno + 1) # 1-based in Fonduer ptr += len(sent.words) yield sent
[docs] def is_parsable(self, document_name: str) -> bool: """Whether visual information can be parsed. Currently always return True. :param document_name: the document name. """ return True