"""Fonduer mention."""
import logging
import re
from builtins import map, range
from typing import Any, Collection, Dict, Iterable, Iterator, List, Optional, Set, Union
from sqlalchemy.orm import Session
from fonduer.candidates.matchers import _Matcher
from fonduer.candidates.models import Candidate, Mention
from fonduer.candidates.models.candidate import candidate_subclasses
from fonduer.candidates.models.caption_mention import TemporaryCaptionMention
from fonduer.candidates.models.cell_mention import TemporaryCellMention
from fonduer.candidates.models.document_mention import TemporaryDocumentMention
from fonduer.candidates.models.figure_mention import TemporaryFigureMention
from fonduer.candidates.models.paragraph_mention import TemporaryParagraphMention
from fonduer.candidates.models.section_mention import TemporarySectionMention
from fonduer.candidates.models.span_mention import TemporarySpanMention
from fonduer.candidates.models.table_mention import TemporaryTableMention
from fonduer.candidates.models.temporary_context import TemporaryContext
from fonduer.parser.models import Context, Document, Sentence
from fonduer.utils.udf import UDF, UDFRunner
from fonduer.utils.utils import get_dict_of_stable_id
logger = logging.getLogger(__name__)
class MentionSpace(object):
    """Define the **space** of Mention objects.

    Calling *apply(x)* given an object *x* returns a generator over mentions
    in *x*.
    """

    def __init__(self) -> None:
        """Initialize mention space."""

    def apply(self, x: Context) -> Iterator[TemporaryContext]:
        """Apply function takes a Context and return a mention generator.

        Subclasses must override this; the base class only defines the
        interface.

        :param x: The input Context.
        :yield: The mention generator.
        """
        raise NotImplementedError()
class Ngrams(MentionSpace):
    """Define the space of Mentions as all n-grams in a Sentence.

    Define the space of Mentions as all n-grams (n_min <= n <= n_max) in a
    Sentence *x*, indexing by **character offset**.

    :param n_min: Lower limit for the generated n_grams.
    :param n_max: Upper limit for the generated n_grams.
    :param split_tokens: Tokens, on which unigrams are split into two separate
        unigrams.
    :type split_tokens: tuple, list of str.
    """

    def __init__(
        self, n_min: int = 1, n_max: int = 5, split_tokens: Collection[str] = ()
    ) -> None:
        """Initialize Ngrams."""
        # NOTE: default changed from a mutable ``[]`` to an immutable ``()``
        # to avoid the shared-mutable-default pitfall; behavior is unchanged.
        MentionSpace.__init__(self)
        self.n_min = n_min
        self.n_max = n_max
        # Sort longest-first so the regex alternation prefers the longest
        # matching split token (e.g. "--" wins over "-").
        self.split_rgx = (
            r"(" + r"|".join(map(re.escape, sorted(split_tokens, reverse=True))) + r")"
            if split_tokens
            else None
        )

    def apply(self, context: Sentence) -> Iterator[TemporarySpanMention]:
        """Apply function takes a Sentence and return a mention generator.

        :param context: The input Sentence.
        :yield: The mention generator.
        """
        # These are the character offset--**relative to the sentence
        # start**--for each _token_
        offsets = context.char_offsets

        # Loop over all n-grams in **reverse** order (to facilitate
        # longest-match semantics)
        L = len(offsets)
        seen: Set[TemporarySpanMention] = set()
        for j in range(self.n_max, self.n_min - 1, -1):
            for i in range(L - j + 1):
                w = context.words[i + j - 1]
                start = offsets[i]
                end = offsets[i + j - 1] + len(w) - 1
                ts = TemporarySpanMention(
                    char_start=start, char_end=end, sentence=context
                )
                if ts not in seen:
                    seen.add(ts)
                    yield ts

                # For unigrams only, additionally split the token on
                # split_tokens and emit every sub-span between split points.
                if (
                    j == 1
                    and self.n_max >= 1
                    and self.n_min <= 1
                    and self.split_rgx is not None
                    and end - start > 0
                ):
                    # The token's own text (indices below are relative to it).
                    text = context.text[start - offsets[0] : end - offsets[0] + 1]
                    start_idxs = [0]
                    end_idxs = []
                    for m in re.finditer(self.split_rgx, text):
                        start_idxs.append(m.end())
                        end_idxs.append(m.start())
                    end_idxs.append(len(text))
                    for start_idx in start_idxs:
                        for end_idx in end_idxs:
                            if start_idx < end_idx:
                                # BUGFIX: start_idx/end_idx are relative to
                                # the token text, while the unsplit n-grams
                                # above use offset coordinates (``start``);
                                # shift by ``start`` so both span kinds share
                                # one coordinate system.
                                ts = TemporarySpanMention(
                                    char_start=start + start_idx,
                                    char_end=start + end_idx - 1,
                                    sentence=context,
                                )
                                if ts not in seen and ts.get_span():
                                    seen.add(ts)
                                    yield ts
class MentionNgrams(Ngrams):
    """Defines the **space** of Mentions as n-grams in a Document.

    Defines the space of Mentions as all n-grams (n_min <= n <= n_max) in a
    Document *x*, divided into Sentences inside of html elements (such as table
    cells).

    :param n_min: Lower limit for the generated n_grams.
    :param n_max: Upper limit for the generated n_grams.
    :param split_tokens: Tokens, on which unigrams are split into two separate
        unigrams.
    :type split_tokens: tuple, list of str.
    """

    def __init__(
        self, n_min: int = 1, n_max: int = 5, split_tokens: Collection[str] = ()
    ) -> None:
        """Initialize MentionNgrams."""
        # NOTE: default changed from a mutable ``[]`` to an immutable ``()``
        # to avoid the shared-mutable-default pitfall; behavior is unchanged.
        Ngrams.__init__(self, n_min=n_min, n_max=n_max, split_tokens=split_tokens)

    def apply(self, doc: Document) -> Iterator[TemporarySpanMention]:
        """Generate MentionNgrams from a Document by parsing all of its Sentences.

        :param doc: The ``Document`` to parse.
        :raises TypeError: If the input doc is not of type ``Document``.
        """
        if not isinstance(doc, Document):
            raise TypeError(
                "Input Contexts to MentionNgrams.apply() must be of type Document"
            )
        # Delegate to Ngrams.apply for every sentence in the document.
        for sentence in doc.sentences:
            yield from Ngrams.apply(self, sentence)
class MentionSentences(MentionSpace):
    """Defines the space of Mentions as all sentences in a Document *x*."""

    def __init__(self) -> None:
        """Initialize MentionSentences."""
        super().__init__()

    def apply(self, doc: Document) -> Iterator[TemporarySpanMention]:
        """
        Generate MentionSentences from a Document by parsing all of its Sentences.

        :param doc: The ``Document`` to parse.
        :raises TypeError: If the input doc is not of type ``Document``.
        """
        if not isinstance(doc, Document):
            raise TypeError(
                "Input Contexts to MentionSentences.apply() must be of type Document"
            )
        # One span mention per sentence, covering its full text.
        for sentence in doc.sentences:
            last_char = len(sentence.text) - 1
            yield TemporarySpanMention(
                char_start=0, char_end=last_char, sentence=sentence
            )
class MentionParagraphs(MentionSpace):
    """Defines the space of Mentions as all paragraphs in a Document *x*."""

    def __init__(self) -> None:
        """Initialize MentionParagraphs."""
        super().__init__()

    def apply(self, doc: Document) -> Iterator[TemporaryParagraphMention]:
        """
        Generate MentionParagraphs from a Document by parsing all of its Paragraphs.

        :param doc: The ``Document`` to parse.
        :raises TypeError: If the input doc is not of type ``Document``.
        """
        if not isinstance(doc, Document):
            raise TypeError(
                "Input Contexts to MentionParagraphs.apply() must be of type Document"
            )
        # Wrap every paragraph of the document in a temporary mention.
        yield from map(TemporaryParagraphMention, doc.paragraphs)
class MentionCaptions(MentionSpace):
    """Defines the space of Mentions as all captions in a Document *x*."""

    def __init__(self) -> None:
        """Initialize MentionCaptions."""
        super().__init__()

    def apply(self, doc: Document) -> Iterator[TemporaryCaptionMention]:
        """
        Generate MentionCaptions from a Document by parsing all of its Captions.

        :param doc: The ``Document`` to parse.
        :raises TypeError: If the input doc is not of type ``Document``.
        """
        if not isinstance(doc, Document):
            raise TypeError(
                "Input Contexts to MentionCaptions.apply() must be of type Document"
            )
        # Wrap every caption of the document in a temporary mention.
        yield from map(TemporaryCaptionMention, doc.captions)
class MentionCells(MentionSpace):
    """Defines the space of Mentions as all cells in a Document *x*."""

    def __init__(self) -> None:
        """Initialize MentionCells."""
        super().__init__()

    def apply(self, doc: Document) -> Iterator[TemporaryCellMention]:
        """
        Generate MentionCells from a Document by parsing all of its Cells.

        :param doc: The ``Document`` to parse.
        :raises TypeError: If the input doc is not of type ``Document``.
        """
        if not isinstance(doc, Document):
            raise TypeError(
                "Input Contexts to MentionCells.apply() must be of type Document"
            )
        # Wrap every table cell of the document in a temporary mention.
        yield from map(TemporaryCellMention, doc.cells)
class MentionTables(MentionSpace):
    """Defines the space of Mentions as all tables in a Document *x*."""

    def __init__(self) -> None:
        """Initialize MentionTables."""
        super().__init__()

    def apply(self, doc: Document) -> Iterator[TemporaryTableMention]:
        """
        Generate MentionTables from a Document by parsing all of its Tables.

        :param doc: The ``Document`` to parse.
        :raises TypeError: If the input doc is not of type ``Document``.
        """
        if not isinstance(doc, Document):
            raise TypeError(
                "Input Contexts to MentionTables.apply() must be of type Document"
            )
        # Wrap every table of the document in a temporary mention.
        yield from map(TemporaryTableMention, doc.tables)
class MentionSections(MentionSpace):
    """Defines the space of Mentions as all sections in a Document *x*."""

    def __init__(self) -> None:
        """Initialize MentionSections."""
        super().__init__()

    def apply(self, doc: Document) -> Iterator[TemporarySectionMention]:
        """
        Generate MentionSections from a Document by parsing all of its Sections.

        :param doc: The ``Document`` to parse.
        :raises TypeError: If the input doc is not of type ``Document``.
        """
        if not isinstance(doc, Document):
            raise TypeError(
                "Input Contexts to MentionSections.apply() must be of type Document"
            )
        # Wrap every section of the document in a temporary mention.
        yield from map(TemporarySectionMention, doc.sections)
class MentionDocuments(MentionSpace):
    """Defines the space of Mentions as a document in a Document *x*."""

    def __init__(self) -> None:
        """Initialize MentionDocuments."""
        super().__init__()

    def apply(self, doc: Document) -> Iterator[TemporaryDocumentMention]:
        """
        Generate MentionDocuments from a Document by using document.

        :param doc: The ``Document`` to parse.
        :raises TypeError: If the input doc is not of type ``Document``.
        """
        if not isinstance(doc, Document):
            raise TypeError(
                "Input Contexts to MentionDocuments.apply() must be of type Document"
            )
        # A document contributes exactly one mention: itself.
        yield TemporaryDocumentMention(doc)
class MentionExtractorUDF(UDF):
    """UDF for performing mention extraction."""

    def __init__(
        self,
        mention_classes: Union[Mention, List[Mention]],
        mention_spaces: Union[MentionSpace, List[MentionSpace]],
        matchers: Union[_Matcher, List[_Matcher]],
        **kwargs: Any,
    ):
        """Initialize the MentionExtractorUDF.

        :param mention_classes: The Mention type(s) to extract; index i is
            paired with index i of ``mention_spaces`` and ``matchers``.
        :param mention_spaces: The MentionSpace(s) that generate candidate
            TemporaryContexts from a Document.
        :param matchers: The Matcher(s) used to filter the generated contexts.
        :param kwargs: Forwarded to the ``UDF`` base class.
        """
        # Normalize each argument to a list so the three sequences can be
        # iterated in parallel (a single object is treated as a 1-element
        # list).
        self.mention_classes = (
            mention_classes
            if isinstance(mention_classes, (list, tuple))
            else [mention_classes]
        )
        self.mention_spaces = (
            mention_spaces
            if isinstance(mention_spaces, (list, tuple))
            else [mention_spaces]
        )
        self.matchers = matchers if isinstance(matchers, (list, tuple)) else [matchers]

        # Preallocates internal data structure (not used in apply(); kept for
        # compatibility with code outside this view).
        self.child_context_set: Set[TemporaryContext] = set()

        super().__init__(**kwargs)

    def apply(self, doc: Document, **kwargs: Any) -> Document:
        """Extract mentions from the given Document.

        :param doc: A document to process.
        """
        # Get a dict of stable_id of contexts so already-persisted contexts
        # are re-used rather than duplicated.
        dict_of_stable_id: Dict[str, Context] = get_dict_of_stable_id(doc)

        # Iterate over each mention class
        for i, mention_class in enumerate(self.mention_classes):
            # Hoist the backref-collection name out of the inner loop; it is
            # constant per mention class.
            collection_name = mention_class.__tablename__ + "s"
            # Generate TemporaryContexts that are children of the context using
            # the mention_space and filtered by the Matcher
            for child_context in self.matchers[i].apply(
                self.mention_spaces[i].apply(doc)
            ):
                # Skip if this temporary context is already used by this
                # mention class. A generator (not a list) lets ``any``
                # short-circuit on the first match.
                stable_id = child_context.get_stable_id()
                if hasattr(doc, collection_name) and any(
                    m.context.stable_id == stable_id
                    for m in getattr(doc, collection_name)
                ):
                    continue

                # Re-use a persisted context if exists.
                if stable_id in dict_of_stable_id:
                    context = dict_of_stable_id[stable_id]
                # Persist a temporary context.
                else:
                    context_type = child_context._get_table()
                    context = context_type(child_context)
                    dict_of_stable_id[stable_id] = context
                mention_args = {"document": doc, "context": context}

                # Add Mention to session. NOTE(review): the instance is never
                # explicitly added to a session here — presumably the
                # ``document`` relationship keeps it reachable for the flush;
                # confirm against the UDF runner.
                mention_class(**mention_args)
        return doc