# Source code for cltk.tokenizers.akk

""" Code for word tokenization: Akkadian
"""

__author__ = [
    "Andrew Deloucas <adeloucas@g.harvard.edu>",
    "Patrick J. Burns <patrick@diyclassics.org>",
]
__license__ = "MIT License."

import re

from cltk.tokenizers.word import WordTokenizer


class AkkadianWordTokenizer(WordTokenizer):
    """Akkadian word and cuneiform tokenizer.

    Splits transliterated lines into ``(token, language)`` pairs,
    tracking switches between Akkadian and Sumerian (marked by
    underscores) and determinatives (marked by ``{...}``).
    """

    def tokenize(self, text: str):
        """Operate on a single line of text; return all words in the
        line as ``(word, language)`` tuples in a list.

        input: "1. isz-pur-ram a-na"
        output: [("isz-pur-ram", "akkadian"), ("a-na", "akkadian")]

        :param text: text string
        :return: list of tuples: (word, language)

        NOTE(review): the example output omits the leading line number
        ("1."), but ``text.split()`` keeps every token, so a numbered
        line's first token is returned too — confirm intended.
        """
        # Only match a string if it has a beginning underscore anywhere
        # (opening word of a Sumerian block).
        beginning_underscore = "_[^_]+(?!_)$"
        # Only match a string if it has an ending underscore anywhere
        # (closing word of a Sumerian block).
        ending_underscore = "^(?<!_)[^_]+_"
        # Only match a string if it has two underscores (a lone Sumerian
        # word embedded in Akkadian text).
        two_underscores = "_[^_]+_"

        language = "akkadian"
        output_words = []
        for word in text.split():
            if re.search(two_underscores, word):
                # Two underscores: this word is Sumerian while the
                # neighboring words stay Akkadian.
                output_words.append((word, "sumerian"))
            elif re.search(beginning_underscore, word):
                # Initial underscore only: starting a Sumerian block.
                language = "sumerian"
                output_words.append((word, language))
            elif re.search(ending_underscore, word):
                # Trailing underscore only: ending a Sumerian block.
                output_words.append((word, language))
                language = "akkadian"
            else:
                # No underscore: continue in the current language.
                output_words.append((word, language))
        return output_words

    def tokenize_sign(self, word: str):
        """Take a ``(word, language)`` tuple and split the word up into
        individual sign tuples ``(sign, language)`` in a list.

        input: ("{gisz}isz-pur-ram", "akkadian")
        output: [("gisz", "determinative"), ("isz", "akkadian"),
                 ("pur", "akkadian"), ("ram", "akkadian")]

        :param word: tuple created by the word tokenizer
        :return: list of tuples: (sign, function or language)
        """
        word_signs = []
        sign = ""
        language = word[1]
        determinative = False
        for char in word[0]:
            if determinative:
                # Inside a {...} determinative: accumulate until "}".
                if char == "}":
                    determinative = False
                    if sign:
                        word_signs.append((sign, "determinative"))
                    sign = ""
                    language = word[1]
                else:
                    sign += char
                continue
            if language not in ("akkadian", "sumerian"):
                # Unknown base language: characters are silently dropped
                # (preserves the original branch-less fall-through).
                continue
            if char == "{":
                if sign:
                    word_signs.append((sign, language))
                sign = ""
                determinative = True
            elif char == "_":
                if sign:
                    word_signs.append((sign, language))
                sign = ""
                # Underscore toggles: akkadian -> sumerian; sumerian ->
                # back to the word's base language (word[1]).
                language = "sumerian" if language == "akkadian" else word[1]
            elif char == "-":
                if sign:
                    word_signs.append((sign, language))
                sign = ""
                # Sign separator: reset to the word's base language.
                language = word[1]
            else:
                sign += char
        if sign:
            # Flush the trailing sign.
            word_signs.append((sign, language))
        return word_signs

    @staticmethod
    def compute_indices(text: str, tokens):
        """Return the start index in ``text`` of each token in
        ``tokens`` (a list of ``(token, language)`` tuples), searching
        left to right so repeated tokens map to successive occurrences.

        :param text: the original line the tokens came from
        :param tokens: list of (token, language) tuples
        :return: list of integer start offsets, one per token
        """
        indices = []
        for i, token in enumerate(tokens):
            if i == 0:
                indices.append(text.find(token[0]))
            else:
                # Resume the search just past the previous token so a
                # repeated token finds its next occurrence.
                prev_end = indices[-1] + len(tokens[i - 1][0])
                indices.append(prev_end + text[prev_end:].find(token[0]))
        return indices