# Source code for fonduer.features.feature_libs.textual_features

"""Fonduer textual feature extractor."""
from builtins import range
from typing import Any, Callable, Dict, Iterator, List, Set, Tuple, Union

from treedlib import (
    Children,
    Compile,
    Indicator,
    LeftNgrams,
    LeftSiblings,
    Mention,
    Ngrams,
    Parents,
    RightNgrams,
    RightSiblings,
    compile_relation_feature_generator,
)

from fonduer.candidates.models import Candidate, ImplicitSpanMention, SpanMention
from fonduer.features.feature_libs.tree_structs import corenlp_to_xmltree
from fonduer.utils.config import get_config
from fonduer.utils.data_model_utils import get_left_ngrams, get_right_ngrams
from fonduer.utils.utils import get_as_dict, tokens_to_ngrams

# Constant value attached to every feature yielded by this module.
DEF_VALUE = 1

# Module-level memoization caches. Each maps a key to the set of feature
# strings already computed for it, so repeated requests skip recomputation.
# Nothing in this file ever removes entries from these dicts.
# Keyed by ``span.stable_id``: DDLIB sequence + window features.
unary_ddlib_feats: Dict[str, Set] = {}
# Keyed by ``span.stable_id``: basic word n-gram / span-shape features.
unary_word_feats: Dict[str, Set] = {}
# Keyed by ``span.stable_id``: TreeDLib features of single-mention candidates.
unary_tdl_feats: Dict[str, Set] = {}
# Keyed by ``candidate.id``: TreeDLib features of multi-mention candidates.
multinary_tdl_feats: Dict[str, Set] = {}
# Configuration loaded once at import time; the window sizes and flags used by
# the window and word features below are read from its "featurization" section.
settings = get_config()


def extract_textual_features(
    candidates: Union[Candidate, List[Candidate]],
) -> Iterator[Tuple[int, str, int]]:
    """Extract textual features.

    For every candidate this yields ``(candidate.id, feature_name, DEF_VALUE)``
    triples. Feature names are prefixed by their family:

    * ``DDL_`` -- sequence and window features (lingual sentences only),
    * ``TDL_`` -- TreeDLib features (lingual sentences only),
    * ``BASIC_`` -- word n-gram and span-shape features (always emitted).

    For candidates with more than one mention, the ``DDL_`` and ``BASIC_``
    names additionally carry an ``e{i}_`` tag with the mention's position.

    :param candidates: A candidate or a list of candidates to extract
        features from
    :raises ValueError: If a candidate has no mentions, or if any of its
        mention contexts is not a ``SpanMention``/``ImplicitSpanMention``.
    """
    candidates = candidates if isinstance(candidates, list) else [candidates]
    for candidate in candidates:
        args = tuple(m.context for m in candidate.get_mentions())
        if not args:
            raise ValueError("Accepts candidates with at least one mention.")
        # Every mention is used as a span below, so validate all of them and
        # not only the first one.
        for arg in args:
            if not isinstance(arg, (SpanMention, ImplicitSpanMention)):
                raise ValueError(
                    f"Accepts Span/ImplicitSpan-type mentions, {type(arg)}-type found."
                )

        # Unary candidates
        if len(args) == 1:
            span: Union[SpanMention, ImplicitSpanMention] = args[0]
            if span.sentence.is_lingual():
                sidxs = list(
                    range(span.get_word_start_index(), span.get_word_end_index() + 1)
                )
                if sidxs:
                    # Add DDLIB entity features
                    for f in _get_ddlib_feats(span, get_as_dict(span.sentence), sidxs):
                        yield candidate.id, f"DDL_{f}", DEF_VALUE
                    # Add TreeDLib entity features
                    if span.stable_id not in unary_tdl_feats:
                        # Build the generator and the tree only on a cache
                        # miss, and store the set only once it is complete so
                        # a failure cannot leave a truncated entry behind.
                        get_tdl_feats = _compile_entity_feature_generator()
                        xmltree = corenlp_to_xmltree(span.sentence)
                        unary_tdl_feats[span.stable_id] = set(
                            get_tdl_feats(xmltree.root, sidxs)
                        )
                    for f in unary_tdl_feats[span.stable_id]:
                        yield candidate.id, f"TDL_{f}", DEF_VALUE
            for f in _get_word_feats(span):
                yield candidate.id, f"BASIC_{f}", DEF_VALUE

        # Multinary candidates
        else:
            spans = args
            if all(span.sentence.is_lingual() for span in spans):
                sents = [get_as_dict(span.sentence) for span in spans]
                s_idxs = [
                    list(
                        range(
                            span.get_word_start_index(), span.get_word_end_index() + 1
                        )
                    )
                    for span in spans
                ]
                if all(len(s_idx) > 0 for s_idx in s_idxs):
                    # Add DDLIB entity features for relation
                    for i, (span, sent, s_idx) in enumerate(
                        zip(spans, sents, s_idxs)
                    ):
                        for f in _get_ddlib_feats(span, sent, s_idx):
                            yield candidate.id, f"DDL_e{i}_{f}", DEF_VALUE

                    # Add TreeDLib relation features
                    if candidate.id not in multinary_tdl_feats:
                        get_tdl_feats = compile_relation_feature_generator(
                            is_multary=True
                        )
                        # NOTE(review): only the first span's sentence is
                        # turned into a tree, yet the word indexes of every
                        # span are passed against it -- presumably all
                        # mentions share one sentence; confirm.
                        xmltree = corenlp_to_xmltree(spans[0].sentence)
                        multinary_tdl_feats[candidate.id] = set(
                            get_tdl_feats(xmltree.root, s_idxs)
                        )
                    for f in multinary_tdl_feats[candidate.id]:
                        yield candidate.id, f"TDL_{f}", DEF_VALUE
            for i, span in enumerate(spans):
                for f in _get_word_feats(span):
                    yield candidate.id, f"BASIC_e{i}_{f}", DEF_VALUE


def _compile_entity_feature_generator() -> Callable:
    """Compile the TreeDLib feature generator for a single mention.

    Takes no arguments. The returned callable is the ``apply_mention`` method
    of the compiled template set; in this module it is called with an xml root
    and the list of word indexes of the mention, and iterated for features.
    """
    attribs = ["lemma", "dep_label"]
    mention = Mention(0)

    # Indicators on the mention itself: one per attribute, plus a joint one
    per_attrib_indicators = [Indicator(mention, attrib) for attrib in attribs]
    joint_indicator = Indicator(mention, "dep_label,lemma")

    # The *first element on the* path to the root: ngram lemmas along it
    parent_ngrams = Ngrams(Parents(mention, 3), "lemma", (1, 3))
    child_ngrams = Ngrams(Children(mention), "lemma", (1, 3))

    # The siblings of the mention
    left_sibling_ngrams = [
        LeftNgrams(LeftSiblings(mention), attrib) for attrib in attribs
    ]
    right_sibling_ngrams = [
        RightNgrams(RightSiblings(mention), attrib) for attrib in attribs
    ]

    templates = [
        per_attrib_indicators,
        joint_indicator,
        parent_ngrams,
        child_ngrams,
        left_sibling_ngrams,
        right_sibling_ngrams,
    ]

    # Hand back the bound generator function
    return Compile(templates).apply_mention


def _get_ddlib_feats(
    span: SpanMention, context: Dict[str, Any], idxs: List[int]
) -> Iterator[str]:
    """Minimalist port of generic mention features from ddlib.

    Yields the sequence features and the window features of a mention. The
    result is memoized in ``unary_ddlib_feats`` under ``span.stable_id``, so
    ``context`` and ``idxs`` are only consulted the first time a span is seen.

    :param span: The mention whose ``stable_id`` keys the cache
    :param context: Sentence attributes as a dict of parallel lists
    :param idxs: Word indexes of the mention inside the sentence
    """
    key = span.stable_id
    if key not in unary_ddlib_feats:
        # Build the full set before publishing it: registering an empty set
        # first would leave a truncated entry in the cache if one of the
        # generators raised, and later calls would silently serve it.
        feats: Set[str] = set(_get_seq_features(context, idxs))
        feats.update(_get_window_features(context, idxs))
        unary_ddlib_feats[key] = feats

    yield from unary_ddlib_feats[key]


def _get_seq_features(context: Dict[str, Any], idxs: List[int]) -> Iterator[str]:
    yield f"WORD_SEQ_[{' '.join(context['words'][i] for i in idxs)}]"
    yield f"LEMMA_SEQ_[{' '.join(context['lemmas'][i] for i in idxs)}]"
    yield f"POS_SEQ_[{' '.join(context['pos_tags'][i] for i in idxs)}]"
    yield f"DEP_SEQ_[{' '.join(context['dep_labels'][i] for i in idxs)}]"


def _mask_number(lemma: str) -> str:
    """Return ``"_NUMBER"`` for a lemma that parses as a float, else the lemma."""
    try:
        float(lemma)
    except ValueError:
        return lemma
    return "_NUMBER"


def _join_pos_tags(tags: List[Any]) -> str:
    """Space-join POS tags, spelling missing (falsy) tags as ``"None"``."""
    try:
        return " ".join(tags)
    except TypeError:
        # At least one tag is not a string (e.g. ``None``); substitute a
        # placeholder for the falsy ones and try again.
        return " ".join(tag if tag else "None" for tag in tags)


def _get_window_features(
    context: Dict[str, Any],
    idxs: List[int],
    window: int = settings["featurization"]["textual"]["window_feature"]["size"],
    combinations: bool = settings["featurization"]["textual"]["window_feature"][
        "combinations"
    ],
    isolated: bool = settings["featurization"]["textual"]["window_feature"]["isolated"],
) -> Iterator[str]:
    """Yield lemma and POS features of the words surrounding a mention.

    Up to ``window`` tokens are collected on each side of the mention, never
    crossing the start or end of the sentence. Lemmas that parse as numbers
    are replaced by ``_NUMBER``.

    :param context: Sentence attributes; ``lemmas`` and ``pos_tags`` are read
    :param idxs: Word indexes of the mention; only the first and last are used
    :param window: Maximum number of tokens taken on each side
    :param combinations: Emit ``W_LEMMA_L_*_R_*`` / ``W_POS_L_*_R_*`` features
        for every pairing of a left window with a right window
    :param isolated: Emit ``W_LEFT_*`` / ``W_RIGHT_*`` features for each side
        on its own
    """
    if not idxs:
        return

    lemmas = context["lemmas"]
    pos_tags = context["pos_tags"]
    first, last = idxs[0], idxs[-1]

    left_lemmas: List[str] = []
    left_pos_tags: List[Any] = []
    try:
        # Stop at the sentence start. A negative index does not raise
        # IndexError in Python, it wraps around, which would report tokens
        # from the END of the sentence as left context.
        for offset in range(1, min(window, first) + 1):
            left_lemmas.append(_mask_number(lemmas[first - offset]))
            left_pos_tags.append(pos_tags[first - offset])
    except IndexError:
        pass
    # Collected nearest-first; put back into sentence order
    left_lemmas.reverse()
    left_pos_tags.reverse()

    right_lemmas: List[str] = []
    right_pos_tags: List[Any] = []
    try:
        # Running past the sentence end raises IndexError, which ends the
        # right window early.
        for offset in range(1, window + 1):
            right_lemmas.append(_mask_number(lemmas[last + offset]))
            right_pos_tags.append(pos_tags[last + offset])
    except IndexError:
        pass

    if isolated:
        for size in range(1, len(left_lemmas) + 1):
            yield f"W_LEFT_{size}_[{' '.join(left_lemmas[-size:])}]"
            yield f"W_LEFT_POS_{size}_[{_join_pos_tags(left_pos_tags[-size:])}]"
        for size in range(1, len(right_lemmas) + 1):
            yield f"W_RIGHT_{size}_[{' '.join(right_lemmas[:size])}]"
            yield f"W_RIGHT_POS_{size}_[{_join_pos_tags(right_pos_tags[:size])}]"

    if combinations:
        for l_size in range(1, len(left_lemmas) + 1):
            curr_left_lemmas = " ".join(left_lemmas[-l_size:])
            curr_left_pos_tags = _join_pos_tags(left_pos_tags[-l_size:])
            for r_size in range(1, len(right_lemmas) + 1):
                curr_right_lemmas = " ".join(right_lemmas[:r_size])
                curr_right_pos_tags = _join_pos_tags(right_pos_tags[:r_size])
                yield (
                    f"W_LEMMA_L_{l_size}_R_{r_size}_"
                    f"[{curr_left_lemmas}]_[{curr_right_lemmas}]"
                )
                yield (
                    f"W_POS_L_{l_size}_R_{r_size}_"
                    f"[{curr_left_pos_tags}]_[{curr_right_pos_tags}]"
                )


def _get_word_feats(span: SpanMention) -> Iterator[str]:
    """Yield basic word-level features of a mention.

    Covers the 1- and 2-grams inside the span, the n-grams to its left and
    right within the configured window, whether the span is implicit or
    explicit, whether its text starts with an uppercase character, and its
    length in words. The result is memoized in ``unary_word_feats`` under
    ``span.stable_id``.

    :param span: The mention to describe
    """
    attrib = "words"
    key = span.stable_id

    if key not in unary_word_feats:
        label = attrib.upper()
        window = settings["featurization"]["textual"]["word_feature"]["window"]

        # Collect into a local set and publish it only when complete, so an
        # exception part-way through cannot leave a truncated cache entry
        # that later calls would silently serve.
        feats: Set[str] = set()

        for ngram in tokens_to_ngrams(span.get_attrib_tokens(attrib), n_min=1, n_max=2):
            feats.add(f"CONTAINS_{label}_[{ngram}]")

        for ngram in get_left_ngrams(span, window=window, n_max=2, attrib=attrib):
            feats.add(f"LEFT_{label}_[{ngram}]")

        for ngram in get_right_ngrams(span, window=window, n_max=2, attrib=attrib):
            feats.add(f"RIGHT_{label}_[{ngram}]")

        span_type = "IMPLICIT" if isinstance(span, ImplicitSpanMention) else "EXPLICIT"
        feats.add(f"SPAN_TYPE_[{span_type}]")

        # Slice instead of indexing: an empty span text then simply counts as
        # "not capitalized" rather than raising IndexError.
        if span.get_span()[:1].isupper():
            feats.add("STARTS_WITH_CAPITAL")

        feats.add(f"LENGTH_{span.get_num_words()}")

        unary_word_feats[key] = feats

    yield from unary_word_feats[key]