"""Lemmatization module—includes several classes for different
lemmatizing approaches--based on training data, regex pattern matching,
etc. These can be chained together using the backoff parameter. Also,
includes a pre-built chain that uses models in cltk_data.
The logic behind the backoff lemmatizer is based on backoff POS-tagging in
NLTK and repurposes several of the tagging classes for lemmatization
tasks. See here for more info on sequential backoff tagging in NLTK:
http://www.nltk.org/_modules/nltk/tag/sequential.html
PJB: The Latin lemmatizer modules were completed as part of Google Summer of Code
2016. I have written up a detailed report of the summer work here:
https://gist.github.com/diyclassics/fc80024d65cc237f185a9a061c5d4824.
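
A toy example of chaining two of the lemmatizers defined below via the
``backoff`` parameter (the single regex rule and the 'UNK' fallback are
illustrative, not the models shipped in cltk_data):

>>> default = DefaultLemmatizer('UNK')
>>> regexp = RegexpLemmatizer([('as$', 'a')], backoff=default)
>>> regexp.lemmatize('puellas arma'.split())
[('puellas', 'puella'), ('arma', 'UNK')]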
"""
import re
import reprlib
from typing import Optional
from nltk.tag.sequential import (
RegexpTagger,
SequentialBackoffTagger,
TaggerI,
UnigramTagger,
)
class SequentialBackoffLemmatizer(SequentialBackoffTagger):
"""Abstract base class for lemmatizers created as a subclass of
NLTK's SequentialBackoffTagger. Lemmatizers in this class "[tag]
words sequentially, left to right. Tagging of individual words is
performed by the ``choose_tag()`` method, which should be defined
by subclasses. If a tagger is unable to determine a tag for the
specified token, then its backoff tagger is consulted."
See: https://www.nltk.org/_modules/nltk/tag/sequential.html#SequentialBackoffTagger
:type _taggers: list
:ivar _taggers: A list of all the taggers in the backoff chain,
inc. self.
    :type repr: reprlib.Repr
    :ivar repr: An instance of ``reprlib.Repr`` used to truncate list
        and dict output in subclass ``__repr__`` methods
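
    A minimal sketch of a two-step backoff chain, using the subclasses
    defined below with toy data:

    >>> chain = DictLemmatizer({'arma': 'arma'}, backoff=DefaultLemmatizer('UNK'))
    >>> chain.lemmatize('arma cano'.split())
    [('arma', 'arma'), ('cano', 'UNK')]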
"""
# def choose_tag(self, tokens, index, history):
# pass
def __init__(
self, backoff: Optional[SequentialBackoffTagger], verbose: bool = False
):
"""Setup for SequentialBackoffLemmatizer
:param backoff: Next lemmatizer in backoff chain
:type verbose: bool
:param verbose: Flag to include which lemmatizer assigned in
a given tag in the return tuple
"""
SequentialBackoffTagger.__init__(self, backoff=None)
# Setup backoff chain
if backoff is None:
self._taggers = [self]
else:
self._taggers = [self] + backoff._taggers
self.VERBOSE = verbose
self.repr = reprlib.Repr()
self.repr.maxlist = 1
self.repr.maxdict = 1
    def tag(self, tokens: list[str]):
"""Docs (mostly) inherited from TaggerI; cf.
https://www.nltk.org/_modules/nltk/tag/api.html#TaggerI.tag
Two tweaks:
1. Properly handle 'verbose' listing of current tagger in
the case of None (i.e. ``if tag: etc.``)
2. Keep track of taggers and change return depending on
'verbose' flag
        :rtype: list
:type tokens: list
:param tokens: List of tokens to tag
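
        For example, with the ``verbose`` flag set (using the
        ``DefaultLemmatizer`` defined later in this module):

        >>> lemmatizer = DefaultLemmatizer('UNK', verbose=True)
        >>> lemmatizer.tag('arma'.split())
        [('arma', 'UNK', '<DefaultLemmatizer: lemma=UNK>')]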
"""
tags = []
taggers = []
for i in range(len(tokens)):
tag, tagger = self.tag_one(tokens, i, tags)
tags.append(tag)
            taggers.append(str(tagger) if tag else None)
if self.VERBOSE:
return list(zip(tokens, tags, taggers))
else:
return list(zip(tokens, tags))
    def tag_one(self, tokens: list[str], index: int, history: list[str]) -> tuple:
"""Determine an appropriate tag for the specified token, and
return that tag. If this tagger is unable to determine a tag
for the specified token, then its backoff tagger is consulted.
:rtype: tuple
:type tokens: list
:param tokens: The list of words that are being tagged.
:type index: int
:param index: The index of the word whose tag should be
returned.
:type history: list(str)
:param history: A list of the tags for all words before index.
"""
lemma = None
tagger = None
for _tagger in self._taggers:
lemma = _tagger.choose_tag(tokens, index, history)
tagger = _tagger
if lemma is not None and lemma != "":
break
return lemma, tagger
    def lemmatize(self, tokens: list[str]) -> list[tuple]:
"""
Transform tag method into custom method for lemmatizing
tasks. Cf. ``tag`` method above.
"""
return self.tag(tokens)
class DefaultLemmatizer(SequentialBackoffLemmatizer):
"""Lemmatizer that assigns the same lemma to every token. Useful as the final
tagger in chain, e.g. to assign 'UNK' to all remaining unlemmatized tokens.
:type lemma: str
:param lemma: Lemma to assign to each token
>>> default_lemmatizer = DefaultLemmatizer('UNK')
>>> list(default_lemmatizer.lemmatize('arma virumque cano'.split()))
[('arma', 'UNK'), ('virumque', 'UNK'), ('cano', 'UNK')]
"""
    def __init__(
        self,
        lemma: Optional[str] = None,
        backoff: Optional[SequentialBackoffTagger] = None,
        verbose: bool = False,
    ):
        self.lemma = lemma
        # NB: the passed ``backoff`` is not chained; DefaultLemmatizer is
        # meant to terminate a backoff chain.
        SequentialBackoffLemmatizer.__init__(self, backoff=None, verbose=verbose)
    def choose_tag(self, tokens: list[str], index: int, history: list[str]):
return self.lemma
def __repr__(self):
return f"<{type(self).__name__}: lemma={self.lemma}>"
class IdentityLemmatizer(SequentialBackoffLemmatizer):
"""Lemmatizer that returns a given token as its lemma. Like DefaultLemmatizer,
useful as the final tagger in a chain, e.g. to assign a possible form to
all remaining unlemmatized tokens, increasing the chance of a successful
match.
>>> identity_lemmatizer = IdentityLemmatizer()
>>> list(identity_lemmatizer.lemmatize('arma virumque cano'.split()))
[('arma', 'arma'), ('virumque', 'virumque'), ('cano', 'cano')]
"""
    def __init__(
        self, backoff: Optional[SequentialBackoffTagger] = None, verbose: bool = False
    ):
        # NB: the passed ``backoff`` is not chained; this lemmatizer returns
        # a lemma for every token, so a backoff would never be consulted.
        SequentialBackoffLemmatizer.__init__(self, backoff=None, verbose=verbose)
    def choose_tag(self, tokens: list[str], index: int, history: list[str]):
return tokens[index]
def __repr__(self):
return f"<{type(self).__name__}>"
class DictLemmatizer(SequentialBackoffLemmatizer):
    """Standalone version of the 'model' functionality found in UnigramTagger;
    defining it as its own class makes it clearer that this lemmatizer is
    based on dictionary lookup and does not use training data.
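
    Example (illustrative; the token-to-lemma mapping below is a made-up toy,
    not a model shipped in cltk_data):

    >>> dict_lemmatizer = DictLemmatizer({'arma': 'arma', 'virumque': 'vir'})
    >>> dict_lemmatizer.lemmatize('arma virumque cano'.split())
    [('arma', 'arma'), ('virumque', 'vir'), ('cano', None)]
    """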
    def __init__(
        self,
        lemmas: dict[str, str],
        backoff: Optional[SequentialBackoffTagger] = None,
        source: Optional[str] = None,
        verbose: bool = False,
    ):
        """
        Setup for DictLemmatizer().

        :type lemmas: dict
        :param lemmas: Dictionary with form {TOKEN: LEMMA} to be used
            for 'lookup'-style lemmatization
        :param backoff: Next lemmatizer in backoff chain.
        :param source: Short name for the source of the lemma dict, used
            in ``__repr__``
        """
SequentialBackoffLemmatizer.__init__(self, backoff, verbose=verbose)
self.lemmas = lemmas
self.source = source
    def choose_tag(self, tokens: list[str], index: int, history: list[str]):
"""
Looks up token in ``lemmas`` dict and returns the corresponding
value as lemma.
:rtype: str
:type tokens: list
:param tokens: List of tokens to be lemmatized
        :type index: int
        :param index: Index of the current token
        :type history: list
        :param history: List of lemmata assigned to preceding tokens; NOT USED
"""
        return self.lemmas.get(tokens[index])
def __repr__(self):
if self.source:
return f"<{type(self).__name__}: {self.source}>"
else:
return f"<{type(self).__name__}: {self.repr.repr(self.lemmas)}>"
class UnigramLemmatizer(SequentialBackoffLemmatizer, UnigramTagger):
    """Standalone version of the 'train' functionality found in UnigramTagger;
    defining it as its own class makes it clearer that this lemmatizer is
    based on training data and not on dictionary lookup.
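
    Example (illustrative; ``train_sents`` is a made-up two-token corpus in
    the expected format, a list of sentences of (token, lemma) pairs):

    >>> train_sents = [[('arma', 'arma'), ('cano', 'cano')]]
    >>> unigram_lemmatizer = UnigramLemmatizer(train=train_sents)
    >>> unigram_lemmatizer.lemmatize('arma cano'.split())
    [('arma', 'arma'), ('cano', 'cano')]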
"""
def __init__(
self,
train=None,
model=None,
backoff: Optional[SequentialBackoffTagger] = None,
        source: Optional[str] = None,
cutoff=0,
verbose: bool = False,
):
"""
Setup for UnigramLemmatizer()
"""
        SequentialBackoffLemmatizer.__init__(self, backoff=None, verbose=verbose)
        # UnigramTagger.__init__ both trains the tagger and (re)builds the
        # backoff chain from ``backoff``.
        UnigramTagger.__init__(self, train, model, backoff, cutoff)
self.train = train
self.source = source
def __repr__(self):
if self.source:
return f"<{type(self).__name__}: {self.source}>"
else:
return f"<{type(self).__name__}: {self.repr.repr(self.train)}>"
class RegexpLemmatizer(SequentialBackoffLemmatizer, RegexpTagger):
    """Regular-expression-based lemmatizer, inheriting from
    ``SequentialBackoffLemmatizer`` and ``RegexpTagger``.
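
    Example (illustrative; the single enclitic-stripping rule below is a toy
    pattern, not one of the rule sets shipped in cltk_data):

    >>> regexp_lemmatizer = RegexpLemmatizer([('que$', '')])
    >>> regexp_lemmatizer.lemmatize('virumque'.split())
    [('virumque', 'virum')]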
"""
    def __init__(
        self,
        regexps=None,
        source: Optional[str] = None,
        backoff: Optional[SequentialBackoffTagger] = None,
        verbose: bool = False,
    ):
"""Setup for RegexpLemmatizer()
:type regexps: list
:param regexps: List of tuples of form (PATTERN, REPLACEMENT)
:param backoff: Next lemmatizer in backoff chain.
"""
        SequentialBackoffLemmatizer.__init__(self, backoff=None, verbose=verbose)
        # RegexpTagger.__init__ (re)builds the backoff chain from ``backoff``.
        RegexpTagger.__init__(self, regexps, backoff)
self._regexs = regexps
self.source = source
    def choose_tag(self, tokens: list[str], index: int, history: list[str]):
"""Use regular expressions for rules-based lemmatizing based on word endings;
tokens are matched for patterns with the base kept as a group; an word ending
replacement is added to the (base) group.
:rtype: str
:type tokens: list
:param tokens: List of tokens to be lemmatized
        :type index: int
        :param index: Index of the current token
        :type history: list
        :param history: List of lemmata assigned to preceding tokens; NOT USED
"""
for pattern, replace in self._regexs:
if re.search(pattern, tokens[index]):
return re.sub(pattern, replace, tokens[index])
def __repr__(self):
if self.source:
return f"<{type(self).__name__}: {self.source}>"
else:
return f"<{type(self).__name__}: {self.repr.repr(self._regexs)}>"