# Source code for fonduer.learning.dataset

"""Fonduer dataset."""
import logging
from typing import Any, Dict, List, Optional, Tuple, Union

import numpy as np
import torch
from emmental.data import EmmentalDataset
from scipy.sparse import csr_matrix
from torch import Tensor

from fonduer.candidates.models import Candidate
from fonduer.learning.utils import mark_sentence, mention_to_tokens

logger = logging.getLogger(__name__)

class FonduerDataset(EmmentalDataset):
    """A FonduerDataset class which is inherited from EmmentalDataset.

    This class takes a list of candidates and the corresponding feature
    matrix as input and wraps them into the dict-of-fields format that
    Emmental consumes (``X_dict`` / ``Y_dict``).

    :param name: The name of the dataset.
    :param candidates: The list of candidates.
    :param features: The corresponding feature matrix (sparse CSR, one row
        per candidate).
    :param word2id: The dictionary mapping each word to its id; must contain
        an ``"<unk>"`` entry for out-of-vocabulary words.
    :param labels: If np.array, it's the label for all candidates; if int,
        it's the number of classes of label and we will create placeholder
        labels (mainly used for inference).
    :param index: Which candidates to use. If None, use all candidates.
    """

    def __init__(
        self,
        name: str,
        candidates: List[Candidate],
        features: csr_matrix,
        word2id: Dict,
        labels: Union[np.array, int],
        index: Optional[List[int]] = None,
    ):
        """Initialize FonduerDataset."""
        self.name = name
        self.candidates = candidates
        self.features = features
        self.word2id = word2id
        self.labels = labels
        self.index = index
        self.X_dict: Dict[str, List[Any]] = {}
        self.Y_dict: Dict[str, Tensor] = {}

        self._map_to_id()
        self._map_features()
        self._map_labels()

        # Unique identifier per candidate, required by EmmentalDataset.
        uids = [f"{self.name}_{idx}" for idx in range(len(self.candidates))]
        self.add_features({"_uids_": uids})

        super().__init__(name, self.X_dict, self.Y_dict, "_uids_")

    def __len__(self) -> int:
        """Get the length of the dataset."""
        try:
            if self.index is not None:
                # A subset of candidates was selected.
                return len(self.index)
            else:
                return len(next(iter(self.X_dict.values())))
        except StopIteration:
            # X_dict is empty (no fields at all).
            return 0

    def __getitem__(
        self, index: int
    ) -> Tuple[Dict[str, Union[Tensor, list]], Dict[str, Tensor]]:
        """Get the data from dataset."""
        if self.index is not None:
            # Translate the subset position into the underlying candidate index.
            index = self.index[index]
        x_dict = {name: feature[index] for name, feature in self.X_dict.items()}
        y_dict = {name: label[index] for name, label in self.Y_dict.items()}
        return x_dict, y_dict

    def _map_to_id(self) -> None:
        # One input field per mention slot; all candidates share the same arity,
        # so the first candidate determines the number of fields.
        self.X_dict.update(
            dict([(f"m{i}", []) for i in range(len(self.candidates[0]))])
        )

        for candidate in self.candidates:
            for i in range(len(candidate)):
                # Add mark for each mention in the original sentence
                args = [
                    (
                        candidate[i].context.get_word_start_index(),
                        candidate[i].context.get_word_end_index(),
                        i,
                    )
                ]
                s = mark_sentence(mention_to_tokens(candidate[i]), args)
                # Map words to ids, falling back to "<unk>" for OOV words.
                self.X_dict[f"m{i}"].append(
                    torch.tensor(
                        [
                            self.word2id[w]
                            if w in self.word2id
                            else self.word2id["<unk>"]
                            for w in s
                        ],
                        dtype=torch.long,
                    )
                )

    def _map_features(self) -> None:
        # Per-candidate sparse features: CSR row i spans
        # indptr[i]:indptr[i + 1] in both `indices` and `data`.
        self.X_dict.update({"feature_index": [], "feature_weight": []})
        for i in range(len(self.candidates)):
            # Shift feature ids by 1 so that 0 can serve as padding.
            self.X_dict["feature_index"].append(
                torch.tensor(
                    self.features.indices[
                        self.features.indptr[i] : self.features.indptr[i + 1]
                    ],
                    dtype=torch.long,
                )
                + 1
            )
            self.X_dict["feature_weight"].append(
                torch.tensor(
                    self.features.data[
                        self.features.indptr[i] : self.features.indptr[i + 1]
                    ],
                    dtype=torch.float,
                )
            )

    def _map_labels(self) -> None:
        if isinstance(self.labels, int):
            # `labels` is a class count: create random placeholder labels
            # (mainly used for inference, where true labels are unknown).
            self.Y_dict.update(
                {
                    "labels": torch.from_numpy(
                        np.random.randint(self.labels, size=len(self.candidates))
                    )
                }
            )
        else:
            self.Y_dict.update({"labels": torch.tensor(np.array(self.labels))})