Source code for fonduer.utils.data_model_utils.structural

"""Fonduer structural modality utilities."""
import functools
from builtins import str
from typing import List, Optional, Tuple, Union

import numpy as np
from lxml import etree
from lxml.etree import _ElementTree
from lxml.html import HtmlElement, fromstring

from fonduer.candidates.models import Candidate, Mention
from fonduer.candidates.models.span_mention import SpanMention, TemporarySpanMention
from fonduer.parser.models.sentence import Sentence
from fonduer.utils.data_model_utils.utils import _to_span


def get_tag(mention: Union[Candidate, Mention, TemporarySpanMention]) -> str:
    """Return the HTML tag of the Mention.

    Typical values are 'p', 'h2', 'table', 'div', etc.

    If a candidate is passed in, only the tag of its first Mention is
    returned.

    :param mention: The Mention to evaluate
    :return: the ``html_tag`` of the sentence holding the Mention, as ``str``
    """
    # _to_span normalizes Candidate / Mention / span inputs to a single span.
    sentence = _to_span(mention).sentence
    return str(sentence.html_tag)
def get_attributes(
    mention: Union[Candidate, Mention, TemporarySpanMention]
) -> List[str]:
    """Return the HTML attributes of the Mention.

    If a candidate is passed in, only the attributes of its first Mention
    are returned.

    A sample output of this function on a Mention in a paragraph tag is
    [u'style=padding-top: 8pt;padding-left: 20pt;text-indent: 0pt;text-align:
    left;']

    :param mention: The Mention to evaluate
    :return: list of strings representing HTML attributes
    """
    # The attributes are read straight from the sentence that holds the span.
    sentence = _to_span(mention).sentence
    return sentence.html_attrs
@functools.lru_cache(maxsize=16)
def _get_etree_for_text(text: str) -> _ElementTree:
    """Parse ``text`` as HTML and wrap the root element in an ElementTree.

    The result is memoized (up to 16 distinct texts) so that repeated
    lookups against the same document do not re-parse it.

    :param text: the HTML text to parse
    :return: an ElementTree built around the parsed root element
    """
    root = fromstring(text)
    return etree.ElementTree(root)


def _get_node(sentence: Sentence) -> HtmlElement:
    """Return the element that ``sentence.xpath`` points to in its document.

    :param sentence: the Sentence whose node should be located
    :return: the first match of ``sentence.xpath`` in the parsed document;
        an xpath that matches nothing is not handled here
    """
    # The parsed tree comes from a cache keyed on the document text.
    tree = _get_etree_for_text(sentence.document.text)
    matches = tree.xpath(sentence.xpath)
    return matches[0]
def get_parent_tag(
    mention: Union[Candidate, Mention, TemporarySpanMention]
) -> Optional[str]:
    """Return the HTML tag of the Mention's parent.

    Typical values are 'p', 'h2', 'table', 'div', etc.

    If a candidate is passed in, only the tag of its first Mention is
    returned.

    :param mention: The Mention to evaluate
    :return: the parent element's tag as ``str``, or ``None`` when the
        Mention's element has no parent
    """
    node = _get_node(_to_span(mention).sentence)
    parent = node.getparent()
    if parent is None:
        return None
    return str(parent.tag)
def get_prev_sibling_tags(
    mention: Union[Candidate, Mention, TemporarySpanMention]
) -> List[str]:
    """Return the HTML tags of the Mention's previous siblings.

    Previous siblings are nodes which are at the same level in the HTML tree
    as the given mention, but are declared before the given mention.

    If a candidate is passed in, only the previous siblings of its first
    Mention are considered in the calculation.

    :param mention: The Mention to evaluate
    :return: sibling tags in document order (farthest sibling first, the
        immediately preceding sibling last)
    """
    tags: List[str] = []
    sibling = _get_node(_to_span(mention).sentence).getprevious()
    # Walk backwards from the nearest sibling, collecting nearest-first ...
    while sibling is not None:
        tags.append(str(sibling.tag))
        sibling = sibling.getprevious()
    # ... then flip once so the result reads in document order.
    tags.reverse()
    return tags
def get_next_sibling_tags(
    mention: Union[Candidate, Mention, TemporarySpanMention]
) -> List[str]:
    """Return the HTML tags of the Mention's next siblings.

    Next siblings are nodes which are at the same level in the HTML tree as
    the given mention, but are declared after the given mention.

    If a candidate is passed in, the Mention to use is resolved by
    ``_to_span(mention)``.

    :param mention: The Mention to evaluate
    :return: sibling tags in document order (the immediately following
        sibling first)
    """
    # NOTE(review): earlier documentation said the *last* Mention of a
    # candidate is used, but this is the same ``_to_span(mention)`` call the
    # other helpers in this module make -- confirm which Mention is intended.
    tags: List[str] = []
    sibling = _get_node(_to_span(mention).sentence).getnext()
    while sibling is not None:
        tags.append(str(sibling.tag))
        sibling = sibling.getnext()
    return tags
def get_ancestor_class_names(
    mention: Union[Candidate, Mention, TemporarySpanMention]
) -> List[str]:
    """Return the HTML classes of the Mention's ancestors.

    The walk starts at the Mention's own element and climbs to the root; the
    returned list is ordered root first. Each entry is ``str()`` of the
    element's ``class`` attribute, so an element without that attribute
    contributes the string ``'None'``.

    If a candidate is passed in, only the ancestors of its first Mention are
    returned.

    :param mention: The Mention to evaluate
    :return: one class string per element on the path from the root down to
        the Mention's element
    """
    names: List[str] = []
    node = _get_node(_to_span(mention).sentence)
    # Collect bottom-up, then flip once to get root-first order.
    while node is not None:
        names.append(str(node.get("class")))
        node = node.getparent()
    names.reverse()
    return names
def get_ancestor_tag_names(
    mention: Union[Candidate, Mention, TemporarySpanMention]
) -> List[str]:
    """Return the HTML tags of the Mention's ancestors.

    For example, ['html', 'body', 'p']. The walk starts at the Mention's own
    element and climbs to the root; the returned list is ordered root first.

    If a candidate is passed in, only the ancestors of its first Mention are
    returned.

    :param mention: The Mention to evaluate
    :return: one tag per element on the path from the root down to the
        Mention's element
    """
    names: List[str] = []
    node = _get_node(_to_span(mention).sentence)
    # Collect bottom-up, then flip once to get root-first order.
    while node is not None:
        names.append(str(node.tag))
        node = node.getparent()
    names.reverse()
    return names
def get_ancestor_id_names(
    mention: Union[Candidate, Mention, TemporarySpanMention]
) -> List[str]:
    """Return the HTML ids of the Mention's ancestors.

    The walk starts at the Mention's own element and climbs to the root; the
    returned list is ordered root first. Each entry is ``str()`` of the
    element's ``id`` attribute, so an element without that attribute
    contributes the string ``'None'``.

    If a candidate is passed in, only the ancestors of its first Mention are
    returned.

    :param mention: The Mention to evaluate
    :return: one id string per element on the path from the root down to the
        Mention's element
    """
    names: List[str] = []
    node = _get_node(_to_span(mention).sentence)
    # Collect bottom-up, then flip once to get root-first order.
    while node is not None:
        names.append(str(node.get("id")))
        node = node.getparent()
    names.reverse()
    return names
def common_ancestor(c: Tuple[SpanMention, ...]) -> List[str]:
    """Return the path to the root that is shared between a multinary-Mention Candidate.

    In particular, this is the common path of HTML tags: each Mention's
    sentence xpath is split on ``/`` and the longest leading run of segments
    that is identical for every Mention is returned.

    :param c: The multinary-Mention Candidate to evaluate
    :return: the shared leading xpath segments, in root-first order
    """
    spans = [_to_span(member) for member in c]
    paths = [np.array(span.sentence.xpath.split("/")) for span in spans]
    # No path can share more segments than the shortest one has.
    shortest = min(path.size for path in paths)
    reference = paths[0]

    shared = 0  # number of leading segments known to be common to all paths
    for pos in range(shortest):
        if any(path[pos] != reference[pos] for path in paths):
            break
        shared = pos + 1
    return list(reference[:shared])
def lowest_common_ancestor_depth(c: Tuple[SpanMention, ...]) -> int:
    """Return the lowest common ancestor depth.

    In particular, return the minimum distance between a multinary-Mention
    Candidate to their lowest common ancestor. For example, if the tree
    looked like this::

        html
        ├──<div> Mention 1 </div>
        ├──table
        │    ├──tr
        │    │  └──<th> Mention 2 </th>

    we return 1, the distance from Mention 1 to the html root. Smaller values
    indicate that two Mentions are close structurally, while larger values
    indicate that two Mentions are spread far apart structurally in the
    document.

    :param c: The multinary-Mention Candidate to evaluate
    :return: number of xpath segments of the shortest Mention path that lie
        below the shared leading segments
    """
    spans = [_to_span(member) for member in c]
    paths = [np.array(span.sentence.xpath.split("/")) for span in spans]
    # The shortest path bounds both the shared prefix and the reported depth.
    shortest = min(path.size for path in paths)
    reference = paths[0]

    shared = 0  # number of leading segments known to be common to all paths
    for pos in range(shortest):
        if any(path[pos] != reference[pos] for path in paths):
            break
        shared += 1
    return shortest - shared