# Source code for speechbrain.integrations.nlp.spacy_pipeline

"""Models and tooling for natural language processing using spaCy

Authors
* Sylvain de Langen 2024
"""

from typing import Iterable, Iterator, List, Union

import spacy
import spacy.tokens


def _as_sentence(sentence: Union[str, List[str]]):
    """Normalizes a sentence to a single `str` so that it can be fed to a
    spaCy pipeline, whether it was given as text or as a list of tokens.

    Arguments
    ---------
    sentence: str or list of str
        Either the sentence text itself, or the tokens that make it up.

    Returns
    -------
    str
        `sentence` unchanged when it already is a `str`; otherwise its tokens
        joined together with a single space between each of them."""

    # Token sequences are glued back together; the pipeline re-tokenizes.
    if not isinstance(sentence, str):
        return " ".join(sentence)

    return sentence


def _extract_lemmas(docs: Iterable[spacy.tokens.Doc]):
    """Collects, for every document of a batch, the lemma of each of its
    tokens.

    Arguments
    ---------
    docs: iterable of Doc
        Documents to read lemmas from, typically as returned by `nlp.pipe`.

    Returns
    -------
    list of list of str
        One inner list per document, holding the `lemma_` of each token in
        document order."""
    lemmas_per_doc = []

    # Iterating `docs` consumes it, so a lazy iterator is exhausted here.
    for document in docs:
        lemmas_per_doc.append([token.lemma_ for token in document])

    return lemmas_per_doc



class SpacyPipeline:
    """Wraps a `spaCy pipeline <https://spacy.io/usage/processing-pipelines>`_
    with methods that make it easier to deal with SB's typical sentence format,
    and adds some convenience functions if you only care about a specific task.

    Arguments
    ---------
    nlp : spacy.language.Language
        spaCy text processing pipeline to use.

    Example
    -------
    >>> # NOTE: To run this example, you must first download a pipeline, e.g.
    >>> # spacy download en_core_web_sm
    >>> ler_model = SpacyPipeline.from_name(
    ...     name="en_core_web_sm", exclude=["parser", "ner", "textcat"]
    ... )
    >>> ler_model.lemmatize(["i", "am", "sitting"])
    [['I'], ['be'], ['sit']]
    """

    def __init__(self, nlp: spacy.language.Language):
        # The wrapped pipeline; every method of this class delegates to it.
        self.nlp = nlp

    @staticmethod
    def from_name(name, *args, **kwargs) -> "SpacyPipeline":
        """Create a pipeline by loading a model using `spacy.load`.
        Unlike other toolkits, you must explicitly download the model if you
        want to use a remote model (e.g. `spacy download fr_core_news_md`)
        rather than just specifying a HF hub name.

        .. note::
            If you only need a subset of modules enabled in the pipeline,
            e.g. for lemmatization, consider
            `excluding <https://spacy.io/usage/processing-pipelines#disabling>`_
            using the `exclude=[...]` argument.

        Arguments
        ---------
        name: str | Path
            Package name or model path.
        *args
            Extra positional arguments passed to `spacy.load`.
        **kwargs
            Extra keyword arguments passed to `spacy.load`.

        Returns
        -------
        New SpacyPipeline
        """

        return SpacyPipeline(spacy.load(name, *args, **kwargs))

    def __call__(
        self, inputs: Union[List[str], List[List[str]]]
    ) -> Iterator[spacy.tokens.Doc]:
        """Processes a batch of sentences into an iterator of spaCy documents.

        Arguments
        ---------
        inputs: list of sentences (str or list of tokens)
            Sentences to process, in the form of batches of lists of tokens
            (list of str) or a str.
            In the case of token lists, tokens do *not* need to be already
            tokenized for this specific sequence tagger, and they will be joined
            with spaces instead.
            Note that `inputs` must be a batch: a single bare `str` would be
            iterated character by character, each one treated as a sentence.

        Returns
        -------
        iterator of spacy.tokens.Doc
            Iterator of documents for the passed sentences."""

        # Every batch entry is normalized to plain text before being handed
        # over to the pipeline, which performs its own tokenization.
        return self.nlp.pipe(map(_as_sentence, inputs))

    def lemmatize(
        self, inputs: Union[List[str], List[List[str]]]
    ) -> List[List[str]]:
        """Lemmatize a batch of sentences by processing the input sentences,
        discarding other irrelevant outputs.

        Arguments
        ---------
        inputs: list of sentences (str or list of tokens)
            Sentences to lemmatize, in the form of batches of lists of tokens
            (list of str) or a str.
            In the case of token lists, tokens do *not* need to be already
            tokenized for this specific sequence tagger, and they will be joined
            with spaces instead.

        Returns
        -------
        list of list of str
            For each sentence, the sequence of extracted lemmas as `str`s."""

        # Run the pipeline through `__call__` and keep only the lemmas.
        return _extract_lemmas(self(inputs))