"""Fonduer textual feature extractor."""
from builtins import range
from typing import Any, Callable, Dict, Iterator, List, Set, Tuple, Union
from treedlib import (
Children,
Compile,
Indicator,
LeftNgrams,
LeftSiblings,
Mention,
Ngrams,
Parents,
RightNgrams,
RightSiblings,
compile_relation_feature_generator,
)
from fonduer.candidates.models import Candidate, ImplicitSpanMention, SpanMention
from fonduer.features.feature_libs.tree_structs import corenlp_to_xmltree
from fonduer.utils.config import get_config
from fonduer.utils.data_model_utils import get_left_ngrams, get_right_ngrams
from fonduer.utils.utils import get_as_dict, tokens_to_ngrams
DEF_VALUE = 1
unary_ddlib_feats: Dict[str, Set] = {}
unary_word_feats: Dict[str, Set] = {}
unary_tdl_feats: Dict[str, Set] = {}
multinary_tdl_feats: Dict[str, Set] = {}
settings = get_config()
def _compile_entity_feature_generator() -> Callable:
"""Compile entity feature generator.
Given optional arguments, returns a generator function which accepts an xml
root and a list of indexes for a mention, and will generate relation
features for this entity.
"""
BASIC_ATTRIBS_REL = ["lemma", "dep_label"]
m = Mention(0)
# Basic relation feature templates
temps = [
[Indicator(m, a) for a in BASIC_ATTRIBS_REL],
Indicator(m, "dep_label,lemma"),
# The *first element on the* path to the root: ngram lemmas along it
Ngrams(Parents(m, 3), "lemma", (1, 3)),
Ngrams(Children(m), "lemma", (1, 3)),
# The siblings of the mention
[LeftNgrams(LeftSiblings(m), a) for a in BASIC_ATTRIBS_REL],
[RightNgrams(RightSiblings(m), a) for a in BASIC_ATTRIBS_REL],
]
# return generator function
return Compile(temps).apply_mention
def _get_ddlib_feats(
span: SpanMention, context: Dict[str, Any], idxs: List[int]
) -> Iterator[str]:
"""Minimalist port of generic mention features from ddlib."""
if span.stable_id not in unary_ddlib_feats:
unary_ddlib_feats[span.stable_id] = set()
for seq_feat in _get_seq_features(context, idxs):
unary_ddlib_feats[span.stable_id].add(seq_feat)
for window_feat in _get_window_features(context, idxs):
unary_ddlib_feats[span.stable_id].add(window_feat)
for f in unary_ddlib_feats[span.stable_id]:
yield f
def _get_seq_features(context: Dict[str, Any], idxs: List[int]) -> Iterator[str]:
yield f"WORD_SEQ_[{' '.join(context['words'][i] for i in idxs)}]"
yield f"LEMMA_SEQ_[{' '.join(context['lemmas'][i] for i in idxs)}]"
yield f"POS_SEQ_[{' '.join(context['pos_tags'][i] for i in idxs)}]"
yield f"DEP_SEQ_[{' '.join(context['dep_labels'][i] for i in idxs)}]"
def _get_window_features(
context: Dict[str, Any],
idxs: List[int],
window: int = settings["featurization"]["textual"]["window_feature"]["size"],
combinations: bool = settings["featurization"]["textual"]["window_feature"][
"combinations"
],
isolated: bool = settings["featurization"]["textual"]["window_feature"]["isolated"],
) -> Iterator[str]:
left_lemmas = []
left_pos_tags = []
right_lemmas = []
right_pos_tags = []
try:
for i in range(1, window + 1):
lemma = context["lemmas"][idxs[0] - i]
try:
float(lemma)
lemma = "_NUMBER"
except ValueError:
pass
left_lemmas.append(lemma)
left_pos_tags.append(context["pos_tags"][idxs[0] - i])
except IndexError:
pass
left_lemmas.reverse()
left_pos_tags.reverse()
try:
for i in range(1, window + 1):
lemma = context["lemmas"][idxs[-1] + i]
try:
float(lemma)
lemma = "_NUMBER"
except ValueError:
pass
right_lemmas.append(lemma)
right_pos_tags.append(context["pos_tags"][idxs[-1] + i])
except IndexError:
pass
if isolated:
for i in range(len(left_lemmas)):
yield f"W_LEFT_{i + 1}_[{' '.join(left_lemmas[-i - 1 :])}]"
yield f"W_LEFT_POS_{i + 1}_[{' '.join(left_pos_tags[-i - 1 :])}]"
for i in range(len(right_lemmas)):
yield f"W_RIGHT_{i + 1}_[{' '.join(right_lemmas[: i + 1])}]"
yield f"W_RIGHT_POS_{i + 1}_[{' '.join(right_pos_tags[: i + 1])}]"
if combinations:
for i in range(len(left_lemmas)):
curr_left_lemmas = " ".join(left_lemmas[-i - 1 :])
try:
curr_left_pos_tags = " ".join(left_pos_tags[-i - 1 :])
except TypeError:
new_pos_tags = []
for pos in left_pos_tags[-i - 1 :]:
to_add = pos
if not to_add:
to_add = "None"
new_pos_tags.append(to_add)
curr_left_pos_tags = " ".join(new_pos_tags)
for j in range(len(right_lemmas)):
curr_right_lemmas = " ".join(right_lemmas[: j + 1])
try:
curr_right_pos_tags = " ".join(right_pos_tags[: j + 1])
except TypeError:
new_pos_tags = []
for pos in right_pos_tags[: j + 1]:
to_add = pos
if not to_add:
to_add = "None"
new_pos_tags.append(to_add)
curr_right_pos_tags = " ".join(new_pos_tags)
yield (
f"W_LEMMA_L_{i + 1}_R_{j + 1}_"
f"[{curr_left_lemmas}]_[{curr_right_lemmas}]"
)
yield (
f"W_POS_L_{i + 1}_R_{j + 1}_"
f"[{curr_left_pos_tags}]_[{curr_right_pos_tags}]"
)
def _get_word_feats(span: SpanMention) -> Iterator[str]:
attrib = "words"
if span.stable_id not in unary_word_feats:
unary_word_feats[span.stable_id] = set()
for ngram in tokens_to_ngrams(span.get_attrib_tokens(attrib), n_min=1, n_max=2):
feature = f"CONTAINS_{attrib.upper()}_[{ngram}]"
unary_word_feats[span.stable_id].add(feature)
for ngram in get_left_ngrams(
span,
window=settings["featurization"]["textual"]["word_feature"]["window"],
n_max=2,
attrib=attrib,
):
feature = f"LEFT_{attrib.upper()}_[{ngram}]"
unary_word_feats[span.stable_id].add(feature)
for ngram in get_right_ngrams(
span,
window=settings["featurization"]["textual"]["word_feature"]["window"],
n_max=2,
attrib=attrib,
):
feature = f"RIGHT_{attrib.upper()}_[{ngram}]"
unary_word_feats[span.stable_id].add(feature)
unary_word_feats[span.stable_id].add(
(
f"SPAN_TYPE_["
f"{'IMPLICIT' if isinstance(span, ImplicitSpanMention) else 'EXPLICIT'}"
f"]"
)
)
if span.get_span()[0].isupper():
unary_word_feats[span.stable_id].add("STARTS_WITH_CAPITAL")
unary_word_feats[span.stable_id].add(f"LENGTH_{span.get_num_words()}")
for f in unary_word_feats[span.stable_id]:
yield f