Source code for cltk.tokenizers.utils

""" Tokenization utilities

TODO: KJ consider moving to ``scripts`` dir.
"""

__author__ = ["Patrick J. Burns <patrick@diyclassics.org>"]
__license__ = "MIT License."

import inspect
import pickle
from abc import abstractmethod
from typing import Any, Generator, Set

from nltk.tokenize.punkt import PunktLanguageVars, PunktSentenceTokenizer, PunktTrainer


class SentenceTokenizerTrainer:
    """Train a sentence tokenizer."""

    def __init__(
        self,
        language: str = None,
        punctuation: list[str] = None,
        strict: bool = False,
        strict_punctuation: list[str] = None,
        abbreviations: list[str] = None,
    ):
        """Initialize the sentence tokenizer trainer with optional language-specific parameters.

        :type language: str
        :param language: language for which to train the tokenizer
        :type punctuation: list
        :param punctuation: list of punctuation used to train the sentence tokenizer
        :type strict: bool
        :param strict: option for including additional punctuation for the tokenizer
        :type strict_punctuation: list
        :param strict_punctuation: list of additional punctuation used to train the sentence tokenizer if ``strict`` is used
        :type abbreviations: list
        :param abbreviations: list of abbreviations used to train the sentence tokenizer
        """
        if language:
            self.language = language.lower()
        self.strict = strict
        self.punctuation = punctuation
        self.strict_punctuation = strict_punctuation
        self.abbreviations = abbreviations
    def train_sentence_tokenizer(self, text: str):
        """Train a Punkt sentence tokenizer on ``text``."""
        language_punkt_vars = PunktLanguageVars

        # Set sentence-final punctuation
        if self.punctuation:
            if self.strict:
                language_punkt_vars.sent_end_chars = (
                    self.punctuation + self.strict_punctuation
                )
            else:
                language_punkt_vars.sent_end_chars = self.punctuation

        # Train the Punkt model; the collocation flags must be set before
        # training for them to take effect
        trainer = PunktTrainer(lang_vars=language_punkt_vars())
        trainer.INCLUDE_ALL_COLLOCS = True
        trainer.INCLUDE_ABBREV_COLLOCS = True
        trainer.train(text)

        tokenizer = PunktSentenceTokenizer(trainer.get_params())

        # Add user-supplied abbreviations to the trained parameters
        if self.abbreviations:
            for abbreviation in self.abbreviations:
                tokenizer._params.abbrev_types.add(abbreviation)

        return tokenizer
    def pickle_sentence_tokenizer(self, filename: str, tokenizer):
        # Dump pickled tokenizer
        with open(filename, "wb") as f:
            pickle.dump(tokenizer, f)
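

# Example usage (illustrative sketch only, not part of the module's API):
# trains a Latin Punkt tokenizer from a plain-text corpus and pickles it.
# The corpus path, output filename, and abbreviation list below are
# hypothetical placeholders.
if __name__ == "__main__":
    trainer = SentenceTokenizerTrainer(
        language="Latin",
        punctuation=[".", "?", "!"],
        strict=True,
        strict_punctuation=[":", ";"],
        abbreviations=["cic", "caes"],
    )
    # Hypothetical training corpus
    with open("latin_corpus.txt", encoding="utf-8") as corpus_file:
        training_text = corpus_file.read()
    latin_tokenizer = trainer.train_sentence_tokenizer(training_text)
    print(
        latin_tokenizer.tokenize(
            "Gallia est omnis divisa in partes tres. Quarum unam incolunt Belgae."
        )
    )
    # Hypothetical output filename
    trainer.pickle_sentence_tokenizer("latin_punkt.pickle", latin_tokenizer)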