# Source code for cltk.tokenizers.akk

""" Code for word tokenization: Akkadian
"""

__author__ = [
    "Andrew Deloucas <adeloucas@g.harvard.edu>",
    "Patrick J. Burns <patrick@diyclassics.org>",
]
__license__ = "MIT License."

import re

from cltk.tokenizers.word import WordTokenizer


class AkkadianWordTokenizer(WordTokenizer):
    """Akkadian word and cuneiform tokenizer.

    Splits transliterated lines into ``(token, language)`` pairs,
    tracking switches between Akkadian and Sumerian (marked by
    underscores) and determinatives (marked by ``{...}``).
    """

    def tokenize(self, text: str):
        """Operate on a single line of text; return all words in the
        line as ``(word, language)`` tuples in a list.

        input: "1. isz-pur-ram a-na"
        output: [("isz-pur-ram", "akkadian"), ("a-na", "akkadian")]

        :param text: text string
        :return: list of tuples: (word, language)

        NOTE(review): the example output omits the leading line number
        ("1."), but ``text.split()`` keeps every token, so a numbered
        line's first token is returned too — confirm intended.
        """
        # Only match a string if it has a beginning underscore anywhere
        # (opening word of a Sumerian block).
        beginning_underscore = "_[^_]+(?!_)$"
        # Only match a string if it has an ending underscore anywhere
        # (closing word of a Sumerian block).
        ending_underscore = "^(?<!_)[^_]+_"
        # Only match a string if it has two underscores (a lone Sumerian
        # word embedded in Akkadian text).
        two_underscores = "_[^_]+_"

        language = "akkadian"
        output_words = []
        for word in text.split():
            if re.search(two_underscores, word):
                # Two underscores: this word is Sumerian while the
                # neighboring words stay Akkadian.
                output_words.append((word, "sumerian"))
            elif re.search(beginning_underscore, word):
                # Initial underscore only: starting a Sumerian block.
                language = "sumerian"
                output_words.append((word, language))
            elif re.search(ending_underscore, word):
                # Trailing underscore only: ending a Sumerian block.
                output_words.append((word, language))
                language = "akkadian"
            else:
                # No underscore: continue in the current language.
                output_words.append((word, language))
        return output_words

    def tokenize_sign(self, word: str):
        """Take a ``(word, language)`` tuple and split the word up into
        individual sign tuples ``(sign, language)`` in a list.

        input: ("{gisz}isz-pur-ram", "akkadian")
        output: [("gisz", "determinative"), ("isz", "akkadian"),
                 ("pur", "akkadian"), ("ram", "akkadian")]

        :param word: tuple created by the word tokenizer
        :return: list of tuples: (sign, function or language)
        """
        word_signs = []
        sign = ""
        language = word[1]
        determinative = False
        for char in word[0]:
            if determinative:
                # Inside a {...} determinative: accumulate until "}".
                if char == "}":
                    determinative = False
                    if sign:
                        word_signs.append((sign, "determinative"))
                    sign = ""
                    language = word[1]
                else:
                    sign += char
                continue
            if language not in ("akkadian", "sumerian"):
                # Unknown base language: characters are silently dropped
                # (preserves the original branch-less fall-through).
                continue
            if char == "{":
                if sign:
                    word_signs.append((sign, language))
                sign = ""
                determinative = True
            elif char == "_":
                if sign:
                    word_signs.append((sign, language))
                sign = ""
                # Underscore toggles: akkadian -> sumerian; sumerian ->
                # back to the word's base language (word[1]).
                language = "sumerian" if language == "akkadian" else word[1]
            elif char == "-":
                if sign:
                    word_signs.append((sign, language))
                sign = ""
                # Sign separator: reset to the word's base language.
                language = word[1]
            else:
                sign += char
        if sign:
            # Flush the trailing sign.
            word_signs.append((sign, language))
        return word_signs

    @staticmethod
    def compute_indices(text: str, tokens):
        """Return the start index in ``text`` of each token in
        ``tokens`` (a list of ``(token, language)`` tuples), searching
        left to right so repeated tokens map to successive occurrences.

        :param text: the original line the tokens came from
        :param tokens: list of (token, language) tuples
        :return: list of integer start offsets, one per token
        """
        indices = []
        for i, token in enumerate(tokens):
            if i == 0:
                indices.append(text.find(token[0]))
            else:
                # Resume the search just past the previous token so a
                # repeated token finds its next occurrence.
                prev_end = indices[-1] + len(tokens[i - 1][0])
                indices.append(prev_end + text[prev_end:].find(token[0]))
        return indices