Source code for cltk.tokenizers.utils

""" Tokenization utilities

TODO: KJ consider moving to ``scripts`` dir.
"""

__author__ = ["Patrick J. Burns <patrick@diyclassics.org>"]
__license__ = "MIT License."

import inspect
import pickle
from abc import abstractmethod
from typing import Any, Generator, Set

from nltk.tokenize.punkt import PunktLanguageVars, PunktSentenceTokenizer, PunktTrainer


class SentenceTokenizerTrainer:
    """Train a sentence tokenizer."""

    def __init__(
        self,
        language: str = None,
        punctuation: list[str] = None,
        strict: bool = False,
        strict_punctuation: list[str] = None,
        abbreviations: list[str] = None,
    ):
        """Initialize the sentence tokenizer trainer with optional language-specific parameters.

        :type language: str
        :param language: language for which to train the tokenizer
        :type punctuation: list
        :param punctuation: list of punctuation used to train the sentence tokenizer
        :type strict: bool
        :param strict: option for including additional punctuation for the tokenizer
        :type strict_punctuation: list
        :param strict_punctuation: list of additional punctuation used to train the sentence tokenizer if ``strict`` is used
        :type abbreviations: list
        :param abbreviations: list of abbreviations used to train the sentence tokenizer
        """
        if language:
            self.language = language.lower()
        self.strict = strict
        self.punctuation = punctuation
        self.strict_punctuation = strict_punctuation
        self.abbreviations = abbreviations
    def train_sentence_tokenizer(self, text: str):
        """Train a Punkt sentence tokenizer on ``text``."""
        language_punkt_vars = PunktLanguageVars

        # Set sentence-final punctuation
        if self.punctuation:
            if self.strict:
                language_punkt_vars.sent_end_chars = (
                    self.punctuation + self.strict_punctuation
                )
            else:
                language_punkt_vars.sent_end_chars = self.punctuation

        # Train the Punkt model; the collocation flags must be set before
        # training for them to take effect
        trainer = PunktTrainer(lang_vars=language_punkt_vars())
        trainer.INCLUDE_ALL_COLLOCS = True
        trainer.INCLUDE_ABBREV_COLLOCS = True
        trainer.train(text)

        tokenizer = PunktSentenceTokenizer(trainer.get_params())

        # Add user-supplied abbreviations to the trained parameters
        if self.abbreviations:
            for abbreviation in self.abbreviations:
                tokenizer._params.abbrev_types.add(abbreviation)

        return tokenizer
    def pickle_sentence_tokenizer(self, filename: str, tokenizer):
        # Dump pickled tokenizer
        with open(filename, "wb") as f:
            pickle.dump(tokenizer, f)
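

# Example usage (illustrative sketch only, not part of the module's API):
# trains a Latin Punkt tokenizer from a plain-text corpus and pickles it.
# The corpus path, output filename, and abbreviation list below are
# hypothetical placeholders.
if __name__ == "__main__":
    trainer = SentenceTokenizerTrainer(
        language="Latin",
        punctuation=[".", "?", "!"],
        strict=True,
        strict_punctuation=[":", ";"],
        abbreviations=["cic", "caes"],
    )
    # Hypothetical training corpus
    with open("latin_corpus.txt", encoding="utf-8") as corpus_file:
        training_text = corpus_file.read()
    latin_tokenizer = trainer.train_sentence_tokenizer(training_text)
    print(
        latin_tokenizer.tokenize(
            "Gallia est omnis divisa in partes tres. Quarum unam incolunt Belgae."
        )
    )
    # Hypothetical output filename
    trainer.pickle_sentence_tokenizer("latin_punkt.pickle", latin_tokenizer)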