Source code for fonduer.features.feature_extractors
"""Fonduer feature extractor."""
from typing import Callable, Dict, Iterator, List, Tuple, Union
from fonduer.candidates.models import Candidate
from fonduer.features.feature_libs.structural_features import (
extract_structural_features,
)
from fonduer.features.feature_libs.tabular_features import extract_tabular_features
from fonduer.features.feature_libs.textual_features import extract_textual_features
from fonduer.features.feature_libs.visual_features import extract_visual_features
FEATURES: Dict[str, Callable[[List[Candidate]], Iterator[Tuple[int, str, int]]]] = {
"textual": extract_textual_features,
"structural": extract_structural_features,
"tabular": extract_tabular_features,
"visual": extract_visual_features,
}
# Type alias for feature_func
Feature_func = Callable[[List[Candidate]], Iterator[Tuple[int, str, int]]]
[docs]class FeatureExtractor(object):
"""A class to extract features from candidates.
:param features: a list of which Fonduer feature types to extract, defaults
to ["textual", "structural", "tabular", "visual"]
:param customize_feature_funcs: a list of customized feature extractors where the
extractor takes a list of candidates as input and yield tuples
of (candidate_id, feature, value), defaults to []
"""
def __init__(
self,
features: List[str] = ["textual", "structural", "tabular", "visual"],
customize_feature_funcs: Union[Feature_func, List[Feature_func]] = [],
) -> None:
"""Initialize FeatureExtractor."""
if not isinstance(customize_feature_funcs, list):
customize_feature_funcs = [customize_feature_funcs]
self.feature_extractors: List[
Callable[[List[Candidate]], Iterator[Tuple[int, str, int]]]
] = []
for feature in features:
if feature not in FEATURES:
raise ValueError(f"Unrecognized feature type: {feature}")
self.feature_extractors.append(FEATURES[feature])
self.feature_extractors.extend(customize_feature_funcs)
[docs] def extract(
self, candidates: Union[List[Candidate], Candidate]
) -> Iterator[Tuple[int, str, int]]:
"""Extract features from candidates.
:param candidates: A list of candidates to extract features from
"""
candidates = candidates if isinstance(candidates, list) else [candidates]
for feature_extractor in self.feature_extractors:
for candidate_id, feature, value in feature_extractor(candidates):
yield candidate_id, feature, value