Source code for fonduer.features.feature_libs.textual_features

from builtins import range
from typing import Any, Callable, Dict, Iterator, List, Set, Tuple

from treedlib import (
    Children,
    Compile,
    Indicator,
    LeftNgrams,
    LeftSiblings,
    Mention,
    Ngrams,
    Parents,
    RightNgrams,
    RightSiblings,
    compile_relation_feature_generator,
)

from fonduer.candidates.models import Candidate, ImplicitSpanMention
from fonduer.candidates.models.span_mention import SpanMention, TemporarySpanMention
from fonduer.features.feature_libs.tree_structs import corenlp_to_xmltree
from fonduer.utils.config import get_config
from fonduer.utils.data_model_utils import get_left_ngrams, get_right_ngrams
from fonduer.utils.utils import get_as_dict, tokens_to_ngrams

DEF_VALUE = 1

unary_ddlib_feats: Dict[str, Set] = {}
unary_word_feats: Dict[str, Set] = {}
unary_tdl_feats: Dict[str, Set] = {}
binary_tdl_feats: Dict[str, Set] = {}
settings = get_config()


[docs]def extract_textual_features( candidates: List[Candidate] ) -> Iterator[Tuple[int, str, int]]: """Extract textual features. :param candidates: A list of candidates to extract features from :type candidates: list """ candidates = candidates if isinstance(candidates, list) else [candidates] for candidate in candidates: args = tuple([m.context for m in candidate.get_mentions()]) if not (isinstance(args[0], TemporarySpanMention)): raise ValueError( f"Accepts Span-type arguments, {type(candidate)}-type found." ) # Unary candidates if len(args) == 1: span: SpanMention = args[0] if span.sentence.is_lingual(): get_tdl_feats = _compile_entity_feature_generator() xmltree = corenlp_to_xmltree(span.sentence) sidxs = list( range(span.get_word_start_index(), span.get_word_end_index() + 1) ) if len(sidxs) > 0: # Add DDLIB entity features for f in _get_ddlib_feats(span, get_as_dict(span.sentence), sidxs): yield candidate.id, f"DDL_{f}", DEF_VALUE # Add TreeDLib entity features if span.stable_id not in unary_tdl_feats: unary_tdl_feats[span.stable_id] = set() for f in get_tdl_feats(xmltree.root, sidxs): unary_tdl_feats[span.stable_id].add(f) for f in unary_tdl_feats[span.stable_id]: yield candidate.id, f"TDL_{f}", DEF_VALUE for f in _get_word_feats(span): yield candidate.id, f"BASIC_{f}", DEF_VALUE # Binary candidates elif len(args) == 2: span1, span2 = args if span1.sentence.is_lingual() and span2.sentence.is_lingual(): get_tdl_feats = compile_relation_feature_generator() sent1 = get_as_dict(span1.sentence) sent2 = get_as_dict(span2.sentence) xmltree = corenlp_to_xmltree(span1.sentence) s1_idxs = list( range(span1.get_word_start_index(), span1.get_word_end_index() + 1) ) s2_idxs = list( range(span2.get_word_start_index(), span2.get_word_end_index() + 1) ) if len(s1_idxs) > 0 and len(s2_idxs) > 0: # Add DDLIB entity features for relation for f in _get_ddlib_feats(span1, sent1, s1_idxs): yield candidate.id, f"DDL_e1_{f}", DEF_VALUE for f in _get_ddlib_feats(span2, sent2, s2_idxs): yield candidate.id, f"DDL_e2_{f}", DEF_VALUE # Add TreeDLib relation features if candidate.id not in binary_tdl_feats: binary_tdl_feats[candidate.id] = set() for f in get_tdl_feats(xmltree.root, s1_idxs, s2_idxs): binary_tdl_feats[candidate.id].add(f) for f in binary_tdl_feats[candidate.id]: yield candidate.id, f"TDL_{f}", DEF_VALUE for f in _get_word_feats(span1): yield candidate.id, f"BASIC_e1_{f}", DEF_VALUE for f in _get_word_feats(span2): yield candidate.id, f"BASIC_e2_{f}", DEF_VALUE else: raise NotImplementedError( "Only handles unary and binary candidates currently" )
def _compile_entity_feature_generator() -> Callable: """ Given optional arguments, returns a generator function which accepts an xml root and a list of indexes for a mention, and will generate relation features for this entity. """ BASIC_ATTRIBS_REL = ["lemma", "dep_label"] m = Mention(0) # Basic relation feature templates temps = [ [Indicator(m, a) for a in BASIC_ATTRIBS_REL], Indicator(m, "dep_label,lemma"), # The *first element on the* path to the root: ngram lemmas along it Ngrams(Parents(m, 3), "lemma", (1, 3)), Ngrams(Children(m), "lemma", (1, 3)), # The siblings of the mention [LeftNgrams(LeftSiblings(m), a) for a in BASIC_ATTRIBS_REL], [RightNgrams(RightSiblings(m), a) for a in BASIC_ATTRIBS_REL], ] # return generator function return Compile(temps).apply_mention def _get_ddlib_feats( span: SpanMention, context: Dict[str, Any], idxs: List[int] ) -> Iterator[str]: """ Minimalist port of generic mention features from ddlib """ if span.stable_id not in unary_ddlib_feats: unary_ddlib_feats[span.stable_id] = set() for seq_feat in _get_seq_features(context, idxs): unary_ddlib_feats[span.stable_id].add(seq_feat) for window_feat in _get_window_features(context, idxs): unary_ddlib_feats[span.stable_id].add(window_feat) for f in unary_ddlib_feats[span.stable_id]: yield f def _get_seq_features(context: Dict[str, Any], idxs: List[int]) -> Iterator[str]: yield f"WORD_SEQ_[{' '.join(context['words'][i] for i in idxs)}]" yield f"LEMMA_SEQ_[{' '.join(context['lemmas'][i] for i in idxs)}]" yield f"POS_SEQ_[{' '.join(context['pos_tags'][i] for i in idxs)}]" yield f"DEP_SEQ_[{' '.join(context['dep_labels'][i] for i in idxs)}]" def _get_window_features( context: Dict[str, Any], idxs: List[int], window: int = settings["featurization"]["textual"]["window_feature"]["size"], combinations: bool = settings["featurization"]["textual"]["window_feature"][ "combinations" ], isolated: bool = settings["featurization"]["textual"]["window_feature"]["isolated"], ) -> Iterator[str]: left_lemmas = [] left_pos_tags = [] right_lemmas = [] right_pos_tags = [] try: for i in range(1, window + 1): lemma = context["lemmas"][idxs[0] - i] try: float(lemma) lemma = "_NUMBER" except ValueError: pass left_lemmas.append(lemma) left_pos_tags.append(context["pos_tags"][idxs[0] - i]) except IndexError: pass left_lemmas.reverse() left_pos_tags.reverse() try: for i in range(1, window + 1): lemma = context["lemmas"][idxs[-1] + i] try: float(lemma) lemma = "_NUMBER" except ValueError: pass right_lemmas.append(lemma) right_pos_tags.append(context["pos_tags"][idxs[-1] + i]) except IndexError: pass if isolated: for i in range(len(left_lemmas)): yield f"W_LEFT_{i + 1}_[{' '.join(left_lemmas[-i - 1 :])}]" yield f"W_LEFT_POS_{i + 1}_[{' '.join(left_pos_tags[-i - 1 :])}]" for i in range(len(right_lemmas)): yield f"W_RIGHT_{i + 1}_[{' '.join(right_lemmas[: i + 1])}]" yield f"W_RIGHT_POS_{i + 1}_[{' '.join(right_pos_tags[: i + 1])}]" if combinations: for i in range(len(left_lemmas)): curr_left_lemmas = " ".join(left_lemmas[-i - 1 :]) try: curr_left_pos_tags = " ".join(left_pos_tags[-i - 1 :]) except TypeError: new_pos_tags = [] for pos in left_pos_tags[-i - 1 :]: to_add = pos if not to_add: to_add = "None" new_pos_tags.append(to_add) curr_left_pos_tags = " ".join(new_pos_tags) for j in range(len(right_lemmas)): curr_right_lemmas = " ".join(right_lemmas[: j + 1]) try: curr_right_pos_tags = " ".join(right_pos_tags[: j + 1]) except TypeError: new_pos_tags = [] for pos in right_pos_tags[: j + 1]: to_add = pos if not to_add: to_add = "None" new_pos_tags.append(to_add) curr_right_pos_tags = " ".join(new_pos_tags) yield ( f"W_LEMMA_L_{i + 1}_R_{j + 1}_" f"[{curr_left_lemmas}]_[{curr_right_lemmas}]" ) yield ( f"W_POS_L_{i + 1}_R_{j + 1}_" f"[{curr_left_pos_tags}]_[{curr_right_pos_tags}]" ) def _get_word_feats(span: SpanMention) -> Iterator[str]: attrib = "words" if span.stable_id not in unary_word_feats: unary_word_feats[span.stable_id] = set() for ngram in tokens_to_ngrams(span.get_attrib_tokens(attrib), n_min=1, n_max=2): feature = f"CONTAINS_{attrib.upper()}_[{ngram}]" unary_word_feats[span.stable_id].add(feature) for ngram in get_left_ngrams( span, window=settings["featurization"]["textual"]["word_feature"]["window"], n_max=2, attrib=attrib, ): feature = f"LEFT_{attrib.upper()}_[{ngram}]" unary_word_feats[span.stable_id].add(feature) for ngram in get_right_ngrams( span, window=settings["featurization"]["textual"]["word_feature"]["window"], n_max=2, attrib=attrib, ): feature = f"RIGHT_{attrib.upper()}_[{ngram}]" unary_word_feats[span.stable_id].add(feature) unary_word_feats[span.stable_id].add( ( f"SPAN_TYPE_[" f"{'IMPLICIT' if isinstance(span, ImplicitSpanMention) else 'EXPLICIT'}" f"]" ) ) if span.get_span()[0].isupper(): unary_word_feats[span.stable_id].add("STARTS_WITH_CAPITAL") unary_word_feats[span.stable_id].add(f"LENGTH_{span.get_num_words()}") for f in unary_word_feats[span.stable_id]: yield f