Source code for fonduer.features.feature_libs.structural_features

"""Fonduer structural feature extractor."""
from typing import Dict, Iterator, List, Set, Tuple, Union

from fonduer.candidates.models import Candidate
from fonduer.candidates.models.span_mention import SpanMention, TemporarySpanMention
from fonduer.utils.data_model_utils import (
    common_ancestor,
    get_ancestor_class_names,
    get_ancestor_id_names,
    get_ancestor_tag_names,
    get_attributes,
    get_next_sibling_tags,
    get_parent_tag,
    get_prev_sibling_tags,
    get_tag,
    lowest_common_ancestor_depth,
)

# Prefix prepended to every feature name yielded by extract_structural_features.
FEATURE_PREFIX = "STR_"
# Value paired with every feature emitted by the _strlib_* generators.
DEF_VALUE = 1

# Cache of per-span feature sets, keyed by span.stable_id. It lives for the
# lifetime of the module and is never cleared in this file.
unary_strlib_feats: Dict[str, Set[Tuple[str, int]]] = {}
# Cache of per-candidate (multi-span) feature sets, keyed by candidate.id.
# NOTE(review): the extractor's return annotation treats candidate.id as int,
# so the `str` key annotation here may be inaccurate -- confirm.
multinary_strlib_feats: Dict[str, Set[Tuple[str, int]]] = {}


def extract_structural_features(
    candidates: Union[Candidate, List[Candidate]],
) -> Iterator[Tuple[int, str, int]]:
    """Extract structural features.

    Features are only produced for candidates whose mention spans all belong
    to sentences for which ``is_structural()`` is true; any other candidate
    yields nothing. Per-span features are cached in ``unary_strlib_feats``
    (keyed by ``span.stable_id``) and per-candidate features in
    ``multinary_strlib_feats`` (keyed by ``candidate.id``).

    This is a generator, so validation errors surface when it is iterated,
    not when it is called.

    :param candidates: A candidate or a list of candidates to extract
        features from
    :raises ValueError: If any mention context of a candidate is not a
        ``TemporarySpanMention``.
    :return: A generator of ``(candidate.id, feature name, value)`` tuples.
    """
    candidates = candidates if isinstance(candidates, list) else [candidates]
    for candidate in candidates:
        args = tuple(m.context for m in candidate.get_mentions())
        if not all(isinstance(arg, TemporarySpanMention) for arg in args):
            raise ValueError(
                f"Structural feature only accepts Span-type arguments, "
                f"{type(candidate)}-type found."
            )

        # Unary candidates: feature names carry no positional prefix.
        if len(args) == 1:
            span = args[0]
            if span.sentence.is_structural():
                for feature, value in _cached_unary_features(span):
                    yield candidate.id, FEATURE_PREFIX + feature, value

        # Multinary candidates
        else:
            spans = args
            if all(span.sentence.is_structural() for span in spans):
                # Per-span features, tagged with the span's position (e0_, e1_, ...).
                for i, span in enumerate(spans):
                    prefix = f"e{i}_"
                    for feature, value in _cached_unary_features(span):
                        yield candidate.id, FEATURE_PREFIX + prefix + feature, value

                # Features describing the spans jointly. The set is fully built
                # before it is stored, so a failure while computing it cannot
                # leave a partial entry in the cache.
                if candidate.id not in multinary_strlib_feats:
                    multinary_strlib_feats[candidate.id] = set(
                        _strlib_multinary_features(spans)
                    )

                for feature, value in multinary_strlib_feats[candidate.id]:
                    yield candidate.id, FEATURE_PREFIX + feature, value


def _cached_unary_features(span: TemporarySpanMention) -> Set[Tuple[str, int]]:
    """Return the cached unary feature set of ``span``, computing it on first use.

    The set is fully built before it is stored under ``span.stable_id``, so an
    exception raised while extracting features cannot leave an empty or partial
    entry in ``unary_strlib_feats`` that later calls would silently reuse.
    """
    key = span.stable_id
    if key not in unary_strlib_feats:
        unary_strlib_feats[key] = set(_strlib_unary_features(span))
    return unary_strlib_feats[key]
def _strlib_unary_features(span: SpanMention) -> Iterator[Tuple[str, int]]:
    """Yield the structural features describing a single span.

    Nothing is yielded when the span's sentence is not structural. Every
    feature name is paired with ``DEF_VALUE``.

    :param span: The span to describe.
    :return: A generator of ``(feature name, value)`` pairs.
    """
    if not span.sentence.is_structural():
        return
    for name in _unary_feature_names(span):
        yield name, DEF_VALUE


def _unary_feature_names(span: SpanMention) -> Iterator[str]:
    """Yield the unary structural feature names of ``span``, in emission order."""
    # The node itself: its tag, each of its attributes, and its parent's tag.
    yield f"TAG_{get_tag(span)}"
    for attribute in get_attributes(span):
        yield f"HTML_ATTR_{attribute}"
    yield f"PARENT_TAG_{get_parent_tag(span)}"

    # Position among siblings: the closest previous sibling is the last entry
    # of the previous-sibling list, and the node position is 1-based.
    preceding = get_prev_sibling_tags(span)
    preceding_count = len(preceding)
    if preceding_count > 0:
        yield f"PREV_SIB_TAG_{preceding[-1]}"
        yield f"NODE_POS_{preceding_count + 1}"
    else:
        yield "FIRST_NODE"

    following = get_next_sibling_tags(span)
    if len(following) > 0:
        yield f"NEXT_SIB_TAG_{following[0]}"
    else:
        yield "LAST_NODE"

    # Ancestor chains, each flattened into one space-separated feature name.
    ancestor_classes = " ".join(get_ancestor_class_names(span))
    yield f"ANCESTOR_CLASS_[{ancestor_classes}]"
    ancestor_tags = " ".join(get_ancestor_tag_names(span))
    yield f"ANCESTOR_TAG_[{ancestor_tags}]"
    ancestor_ids = " ".join(get_ancestor_id_names(span))
    yield f"ANCESTOR_ID_[{ancestor_ids}]"


def _strlib_multinary_features(
    spans: Tuple[SpanMention, ...]
) -> Iterator[Tuple[str, int]]:
    """Yield the structural features relating several spans to each other.

    Two features are emitted, each paired with ``DEF_VALUE``: one built from
    the space-joined result of ``common_ancestor`` and one from the result of
    ``lowest_common_ancestor_depth``.

    :param spans: The spans of a multinary candidate.
    :return: A generator of ``(feature name, value)`` pairs.
    """
    shared_ancestors = " ".join(common_ancestor(spans))
    yield f"COMMON_ANCESTOR_[{shared_ancestors}]", DEF_VALUE

    depth = lowest_common_ancestor_depth(spans)
    yield f"LOWEST_ANCESTOR_DEPTH_[{depth}]", DEF_VALUE