# Source code for speechbrain.integrations.nlp.spacy_pipeline

"""Models and tooling for natural language processing using spaCy

Authors
* Sylvain de Langen 2024
"""

from typing import Iterable, Iterator, List, Union

import spacy
import spacy.tokens


def _as_sentence(sentence: Union[str, List[str]]):
    """Normalizes a sentence to a single `str` so that it can be handed to a
    spaCy pipeline, which expects raw text rather than a list of tokens.

    Arguments
    ---------
    sentence: str or list of str
        Either the sentence text itself, or its tokens as a list of `str`.

    Returns
    -------
    str
        `sentence` unchanged when it already is a `str`; otherwise its tokens
        joined with single spaces.
    """
    already_text = isinstance(sentence, str)
    return sentence if already_text else " ".join(sentence)


def _extract_lemmas(docs: Iterable[spacy.tokens.Doc]):
    """Collects the lemma of every token of every document in a batch.

    Arguments
    ---------
    docs: iterable of Doc
        Documents to read lemmas from, typically as returned by `nlp.pipe`.

    Returns
    -------
    list of list of str
        One inner list per document, holding the `lemma_` of each of its
        tokens in order.
    """
    lemmas_per_doc = []
    for document in docs:
        # Each token exposes its lemma text through the `lemma_` attribute.
        lemmas_per_doc.append([token.lemma_ for token in document])
    return lemmas_per_doc


class SpacyPipeline:
    """Wraps a `spaCy pipeline <https://spacy.io/usage/processing-pipelines>`_
    with methods that make it easier to deal with SB's typical sentence format,
    and adds some convenience functions if you only care about a specific task.

    Inputs are always batches: every element of the input list is treated as
    one sentence, given either as a `str` or as a list of `str` tokens.

    Arguments
    ---------
    nlp : spacy.language.Language
        spaCy text processing pipeline to use.

    Example
    -------
    >>> # NOTE: To run this example, you must first download a pipeline, e.g.
    >>> # spacy download en_core_web_sm
    >>> ler_model = SpacyPipeline.from_name(
    ...     name="en_core_web_sm", exclude=["parser", "ner", "textcat"]
    ... )
    >>> ler_model.lemmatize(["i", "am", "sitting"])
    [['I'], ['be'], ['sit']]
    """

    def __init__(self, nlp: spacy.language.Language):
        self.nlp = nlp

    @staticmethod
    def from_name(name, *args, **kwargs) -> "SpacyPipeline":
        """Create a pipeline by loading a model using `spacy.load`.
        Unlike other toolkits, you must explicitly download the model if you
        want to use a remote model (e.g. `spacy download fr_core_news_md`)
        rather than just specifying a HF hub name.

        .. note::
            If you only need a subset of modules enabled in the pipeline, e.g.
            for lemmatization, consider
            `excluding <https://spacy.io/usage/processing-pipelines#disabling>`_
            them using the `exclude=[...]` argument.

        Arguments
        ---------
        name: str | Path
            Package name or model path.
        *args
            Extra positional arguments passed to `spacy.load`.
        **kwargs
            Extra keyword arguments passed to `spacy.load`.

        Returns
        -------
        New SpacyPipeline
        """
        nlp = spacy.load(name, *args, **kwargs)
        return SpacyPipeline(nlp)

    def __call__(
        self, inputs: Union[List[str], List[List[str]]]
    ) -> Iterator[spacy.tokens.Doc]:
        """Processes a batch of sentences into an iterator of spaCy documents.

        Arguments
        ---------
        inputs: list of sentences (str or list of tokens)
            Sentences to process, in the form of batches of lists of tokens
            (list of str) or a str.
            In the case of token lists, tokens do *not* need to be already
            tokenized for this specific sequence tagger, and they will be
            joined with spaces instead.

        Returns
        -------
        iterator of spacy.tokens.Doc
            Iterator of documents for the passed sentences.
        """
        # The pipeline consumes raw text, so token lists are first joined back
        # into plain strings (lazily, one sentence at a time).
        sentences = map(_as_sentence, inputs)
        return self.nlp.pipe(sentences)

    def lemmatize(
        self, inputs: Union[List[str], List[List[str]]]
    ) -> List[List[str]]:
        """Lemmatize a batch of sentences by processing the input sentences,
        discarding other irrelevant outputs.

        Arguments
        ---------
        inputs: list of sentences (str or list of tokens)
            Sentences to lemmatize, in the form of batches of lists of tokens
            (list of str) or a str.
            In the case of token lists, tokens do *not* need to be already
            tokenized for this specific sequence tagger, and they will be
            joined with spaces instead.

        Returns
        -------
        list of list of str
            For each sentence, the sequence of extracted lemmas as `str`s.
        """
        docs = self(inputs)
        return _extract_lemmas(docs)