"""Fonduer textual modality utilities."""
from itertools import chain
from typing import Iterator, Union
from fonduer.candidates.models import Candidate, Mention
from fonduer.candidates.models.span_mention import TemporarySpanMention
from fonduer.utils.data_model_utils.utils import _to_span, _to_spans
from fonduer.utils.utils import tokens_to_ngrams
[docs]def same_sentence(c: Candidate) -> bool:
"""Return True if all Mentions in the given candidate are from the same Sentence.
:param c: The candidate whose Mentions are being compared
"""
return all(
_to_span(c[i]).sentence is not None
and _to_span(c[i]).sentence == _to_span(c[0]).sentence
for i in range(len(c))
)
[docs]def get_between_ngrams(
c: Candidate,
attrib: str = "words",
n_min: int = 1,
n_max: int = 1,
lower: bool = True,
) -> Iterator[str]:
"""Return the ngrams *between* two unary Mentions of a binary-Mention Candidate.
Get the ngrams *between* two unary Mentions of a binary-Mention Candidate,
where both share the same sentence Context.
:param c: The binary-Mention Candidate to evaluate.
:param attrib: The token attribute type (e.g. words, lemmas, poses)
:param n_min: The minimum n of the ngrams that should be returned
:param n_max: The maximum n of the ngrams that should be returned
:param lower: If 'True', all ngrams will be returned in lower case
"""
if len(c) != 2:
raise ValueError("Only applicable to binary Candidates")
span0 = _to_span(c[0])
span1 = _to_span(c[1])
if span0.sentence != span1.sentence:
raise ValueError(
"Only applicable to Candidates where both spans are \
from the same immediate Context."
)
distance = abs(span0.get_word_start_index() - span1.get_word_start_index())
if span0.get_word_start_index() < span1.get_word_start_index():
for ngram in get_right_ngrams(
span0,
window=distance - 1,
attrib=attrib,
n_min=n_min,
n_max=n_max,
lower=lower,
):
yield ngram
else: # span0.get_word_start_index() > span1.get_word_start_index()
for ngram in get_right_ngrams(
span1,
window=distance - 1,
attrib=attrib,
n_min=n_min,
n_max=n_max,
lower=lower,
):
yield ngram
[docs]def get_left_ngrams(
mention: Union[Candidate, Mention, TemporarySpanMention],
window: int = 3,
attrib: str = "words",
n_min: int = 1,
n_max: int = 1,
lower: bool = True,
) -> Iterator[str]:
"""Get the ngrams within a window to the *left* from the sentence Context.
For higher-arity Candidates, defaults to the *first* argument.
:param mention: The Mention to evaluate. If a candidate is given, default
to its first Mention.
:param window: The number of tokens to the left of the first argument to
return.
:param attrib: The token attribute type (e.g. words, lemmas, poses)
:param n_min: The minimum n of the ngrams that should be returned
:param n_max: The maximum n of the ngrams that should be returned
:param lower: If True, all ngrams will be returned in lower case
"""
span = _to_span(mention)
i = span.get_word_start_index()
for ngram in tokens_to_ngrams(
getattr(span.sentence, attrib)[max(0, i - window) : i],
n_min=n_min,
n_max=n_max,
lower=lower,
):
yield ngram
[docs]def get_right_ngrams(
mention: Union[Candidate, Mention, TemporarySpanMention],
window: int = 3,
attrib: str = "words",
n_min: int = 1,
n_max: int = 1,
lower: bool = True,
) -> Iterator[str]:
"""Get the ngrams within a window to the *right* from the sentence Context.
For higher-arity Candidates, defaults to the *last* argument.
:param mention: The Mention to evaluate. If a candidate is given, default
to its last Mention.
:param window: The number of tokens to the left of the first argument to
return
:param attrib: The token attribute type (e.g. words, lemmas, poses)
:param n_min: The minimum n of the ngrams that should be returned
:param n_max: The maximum n of the ngrams that should be returned
:param lower: If True, all ngrams will be returned in lower case
"""
span = _to_span(mention, idx=-1)
i = span.get_word_end_index()
for ngram in tokens_to_ngrams(
getattr(span.sentence, attrib)[i + 1 : i + 1 + window],
n_min=n_min,
n_max=n_max,
lower=lower,
):
yield ngram
[docs]def get_sentence_ngrams(
mention: Union[Candidate, Mention, TemporarySpanMention],
attrib: str = "words",
n_min: int = 1,
n_max: int = 1,
lower: bool = True,
) -> Iterator[str]:
"""Get the ngrams that are in the Sentence of the given Mention, not including itself.
Note that if a candidate is passed in, all of its Mentions will be
searched.
:param mention: The Mention whose Sentence is being searched
:param attrib: The token attribute type (e.g. words, lemmas, poses)
:param n_min: The minimum n of the ngrams that should be returned
:param n_max: The maximum n of the ngrams that should be returned
:param lower: If True, all ngrams will be returned in lower case
"""
spans = _to_spans(mention)
for span in spans:
for ngram in get_left_ngrams(
span, window=100, attrib=attrib, n_min=n_min, n_max=n_max, lower=lower
):
yield ngram
for ngram in get_right_ngrams(
span, window=100, attrib=attrib, n_min=n_min, n_max=n_max, lower=lower
):
yield ngram
[docs]def get_neighbor_sentence_ngrams(
mention: Union[Candidate, Mention, TemporarySpanMention],
d: int = 1,
attrib: str = "words",
n_min: int = 1,
n_max: int = 1,
lower: bool = True,
) -> Iterator[str]:
"""Get the ngrams that are in the neighoring Sentences of the given Mention.
Note that if a candidate is passed in, all of its Mentions will be searched.
:param mention: The Mention whose neighbor Sentences are being searched
:param attrib: The token attribute type (e.g. words, lemmas, poses)
:param n_min: The minimum n of the ngrams that should be returned
:param n_max: The maximum n of the ngrams that should be returned
:param lower: If True, all ngrams will be returned in lower case
"""
spans = _to_spans(mention)
for span in spans:
for ngram in chain.from_iterable(
[
tokens_to_ngrams(
getattr(sentence, attrib), n_min=n_min, n_max=n_max, lower=lower
)
for sentence in span.sentence.document.sentences
if abs(sentence.position - span.sentence.position) <= d
and sentence != span.sentence
]
):
yield ngram