Source code for cltk.tokenizers.word
"""Language-specific word tokenizers. Primary purpose is
to handle enclitics.
"""
# Module metadata: contributors to this module, each given as "Name <email>".
__author__ = [
"Patrick J. Burns <patrick@diyclassics.org>",
"Kyle P. Johnson <kyle@kyle-p-johnson.com>",
"Natasha Voake <natashavoake@gmail.com>",
"Clément Besnier <clem@clementbesnier.fr>",
"Andrew Deloucas <adeloucas@g.harvard.edu>",
"Todd Cook <todd.g.cook@gmail.com>",
]
# License notice for this module; the full text lives in the LICENSE file.
__license__ = "MIT License. See LICENSE."
import logging
import re
from abc import abstractmethod
from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize.punkt import PunktParameters, PunktSentenceTokenizer
class WordTokenizer:
    """Base class for word tokenizers.

    Subclasses are expected to override :meth:`tokenize` (and
    :meth:`tokenize_sign` where it applies). This class does not derive from
    ``abc.ABC``, so the ``@abstractmethod`` markers below are advisory only
    and do not prevent instantiation.
    """

    @abstractmethod
    def tokenize(self, text: str, model=None):
        """
        Create a list of tokens from a string.
        This method should be overridden by subclasses of WordTokenizer.

        :param text: text to be tokenized
        :type text: str
        :param model: not used by this base implementation
        """
        pass

    @abstractmethod
    def tokenize_sign(self, text: str, model=None):
        """
        Create a list of tokens from a string, for cuneiform signs.
        This method should be overridden by subclasses of WordTokenizer.

        :param text: text to be tokenized
        :type text: str
        :param model: not used by this base implementation
        """
        pass

    @staticmethod
    def compute_indices(text: str, tokens):
        """
        Locate each token in ``text``, scanning left to right.

        Every token is searched for starting just after the end of the
        previously located token, so repeated tokens map to successive
        occurrences.

        :param text: the string the tokens were taken from
        :type text: str
        :param tokens: tokens in the order they appear in ``text``
        :return: one start offset per token; ``-1`` for a token that does not
            occur verbatim at or after the current scan position
        :rtype: list of int
        """
        indices = []
        cursor = 0  # first position in ``text`` not yet consumed by a located token
        for token in tokens:
            # Searching with a start offset avoids copying the tail of ``text``
            # for every token.
            position = text.find(token, cursor)
            indices.append(position)
            # A missing token must not move the scan position; otherwise the
            # offsets of all following tokens would be searched from the wrong
            # place.
            if position != -1:
                cursor = position + len(token)
        return indices
class PunktWordTokenizer(WordTokenizer):
    """Word tokenizer that splits text into sentences, then each sentence into words."""

    def __init__(self, sent_tokenizer=None):
        """
        :param sent_tokenizer: callable (for instance a class) invoked with no
            arguments to build the sentence tokenizer; when falsy, a
            ``PunktSentenceTokenizer`` built from fresh ``PunktParameters``
            is used instead
        """
        if not sent_tokenizer:
            self.sent_tokenizer = PunktSentenceTokenizer(PunktParameters())
            return
        self.sent_tokenizer = sent_tokenizer()

    def tokenize(self, text: str):
        """
        Split ``text`` into sentences and return all word tokens as one flat list.

        :param text: text to be tokenized into words
        :type text: str
        :rtype: list
        """
        sentences = self.sent_tokenizer.tokenize(text)
        words = []
        # Flatten the per-sentence token lists into a single list.
        for sentence_tokens in TreebankWordTokenizer().tokenize_sents(sentences):
            words.extend(sentence_tokens)
        return words
class RegexWordTokenizer(WordTokenizer):
    """Class for regex-based word tokenization.

    Each configured rule is applied to the text with ``re.sub``, in order,
    and the rewritten text is then split on whitespace.
    """

    # The annotation is a string so that it is not evaluated at definition time.
    def __init__(self, patterns: "list[tuple[str, str]] | None" = None):
        """
        :param patterns: substitution rules, each indexable as
            ``(pattern, replacement)``; ``None`` means no rules
        :type patterns: list of (pattern, replacement) pairs, or None
        """
        self.patterns = patterns

    def tokenize(self, text: str):
        """
        Apply every substitution rule to ``text`` and split the result on whitespace.

        :param text: text to be tokenized into words
        :type text: str
        :rtype: list
        """
        # ``self.patterns`` is None when the constructor default was used;
        # treat that as "no rules" instead of failing on iteration.
        for rule in self.patterns or ():
            text = re.sub(rule[0], rule[1], text)
        return text.split()
class CLTKTreebankWordTokenizer(TreebankWordTokenizer):
    """``TreebankWordTokenizer`` extended with a helper that maps tokens to offsets."""

    @staticmethod
    def compute_indices(text: str, tokens):
        """
        Locate each token in ``text``, scanning left to right.

        Every token is searched for starting just after the end of the
        previously located token, so repeated tokens map to successive
        occurrences.

        :param text: the string the tokens were taken from
        :type text: str
        :param tokens: tokens in the order they appear in ``text``
        :return: one start offset per token; ``-1`` for a token that does not
            occur verbatim at or after the current scan position
        :rtype: list of int
        """
        indices = []
        cursor = 0  # first position in ``text`` not yet consumed by a located token
        for token in tokens:
            # Searching with a start offset avoids copying the tail of ``text``
            # for every token.
            position = text.find(token, cursor)
            indices.append(position)
            # A missing token must not move the scan position; otherwise the
            # offsets of all following tokens would be searched from the wrong
            # place.
            if position != -1:
                cursor = position + len(token)
        return indices