# Source code for cltk.text.akk

import re
from re import Match
from typing import Optional
from unicodedata import normalize

# Module metadata.
__author__ = ["Andrew Deloucas <ADeloucas@g.harvard.com>"]
__license__ = "MIT License. See LICENSE."

# Characters that may carry an accent when a sign index is rendered as an
# acute/grave mark (lower case first, then upper case).
VOWELS: str = "aeiou" + "AEIOU"

# ATF ASCII digraph -> Unicode character used in print transliteration.
# Insertion order is kept as: lower-case forms, the aleph, upper-case forms.
TITTLES: dict[str, str] = {
    "s,": "\u1e63",  # s with dot below
    "sz": "\u0161",  # s with caron
    "t,": "\u1e6d",  # t with dot below
    "'": "\u02be",  # modifier letter right half ring
    "S,": "\u1e62",  # S with dot below
    "SZ": "\u0160",  # S with caron
    "T,": "\u1e6c",  # T with dot below
}


def _convert_consonant(sign: str) -> str:
    """Replace ATF ASCII consonant conventions with Unicode characters.

    Every key of the module-level ``TITTLES`` mapping that occurs in
    ``sign`` is replaced by its mapped character; all other characters are
    left untouched.

    >>> signs = ["as,", "S,ATU", "tet,", "T,et", "sza", "ASZ"]
    >>> [_convert_consonant(s) for s in signs]
    ['aṣ', 'ṢATU', 'teṭ', 'Ṭet', 'ša', 'AŠ']

    :param sign: a sign in ATF transliteration
    :return: the sign with every ``TITTLES`` key substituted
    """

    # Replacements are applied one after another in the mapping's own order.
    for atf_form, unicode_form in TITTLES.items():
        sign = sign.replace(atf_form, unicode_form)
    return sign


def _convert_number_to_subscript(num: int) -> str:
    """Render a non-negative integer with Unicode subscript digits.

    Each decimal digit ``d`` of ``num`` is mapped to the code point
    ``U+2080 + d`` (SUBSCRIPT ZERO .. SUBSCRIPT NINE).

    >>> [_convert_number_to_subscript(n) for n in (0, 1, 2, 3, 14)]
    ['₀', '₁', '₂', '₃', '₁₄']

    :param num: the sign index to render
    :return: ``num`` written with subscript digits
    :raises ValueError: if ``num`` is negative, because the minus sign in
        ``str(num)`` cannot be converted to a digit
    """

    return "".join(chr(0x2080 + int(digit)) for digit in str(num))


def _get_number_from_sign(sign: str) -> tuple[str, int]:
    """Capture the numeric index at the end of a sign.

    The index is the run of one to three digits that ends the string; a
    sign without such a run is reported with index ``0``. The sign itself
    is returned unchanged alongside the number.

    >>> signs = ["a", "a1", "be2", "bad3", "buru14"]
    >>> [_get_number_from_sign(s)[1] for s in signs]
    [0, 1, 2, 3, 14]

    :param sign: string
    :return: string, integer
    """

    # re.search returns None when the sign carries no trailing digits.
    match: Optional[Match[str]] = re.search(r"\d{1,3}$", sign)
    number: int = 0 if match is None else int(match[0])
    return sign, number


# noinspection PyUnboundLocalVariable
class ATFConverter:  # pylint: disable=too-few-public-methods
    """Class to convert tokens to unicode.

    Transliterates ATF data from CDLI into readable unicode.
        sz = š
        s, = ṣ
        t, = ṭ
        ' = ʾ
        Sign indices are printed as subscript numbers. When the converter
        is created with ``two_three=False``, indices 2 and 3 are instead
        rendered as an acute / grave accent on the first vowel of the sign.

    For in depth reading on ATF-formatting for CDLI and ORACC:
        Oracc ATF Primer =
        http://oracc.museum.upenn.edu/doc/help/editinginatf/primer/index.html
        ATF Structure =
        http://oracc.museum.upenn.edu/doc/help/editinginatf/primer/structuretutorial/index.html
        ATF Inline =
        http://oracc.museum.upenn.edu/doc/help/editinginatf/primer/inlinetutorial/index.html
    """

    def __init__(self, two_three: bool = True):
        """
        :param two_three: when True (the default) indices 2 and 3 are kept
            as subscript numbers; when False they are written as accents
            (2 = acute, 3 = grave) on the first vowel of the sign.
        """

        self.two_three: bool = two_three

    def _convert_num(self, sign: str) -> str:
        """
        Converts the number registered by _get_number_from_sign.

        Only the trailing index of the sign is rewritten; a sign without a
        trailing index is returned unchanged.

        :param sign: a single sign, optionally ending in a numeric index
        :return: the sign with its index as subscript digits or an accent
        """

        # Locate the trailing index with the same pattern that
        # _get_number_from_sign uses, so that only this suffix is touched
        # and digits elsewhere in the sign are never rewritten.
        suffix = re.search(r"\d{1,3}$", sign)
        if suffix is None:  # "ab" -> "ab"
            return sign

        new_sign, num = _get_number_from_sign(sign)
        stem = new_sign[: suffix.start()]
        # "$" may match just before a final newline; keep what follows.
        tail = new_sign[suffix.end() :]

        # "buru14" -> "buru₁₄"
        subscripted = stem + _convert_number_to_subscript(num) + tail
        if self.two_three or num not in (2, 3):
            return subscripted

        # "bad3" -> "bàd": put the accent on the first vowel of the sign.
        accent = chr(0x0301) if num == 2 else chr(0x0300)
        for i, character in enumerate(stem):
            if character in VOWELS:
                return (
                    stem[:i]
                    + normalize("NFC", character + accent)
                    + stem[i + 1 :]
                    + tail
                )

        # No vowel can carry the accent: keep the index as a subscript
        # rather than dropping it.
        return subscripted

    def process(self, tokens: list[str]) -> list[str]:
        """
        Expects a list of tokens, will return the list converted from ATF
        format to print-format.

        >>> c = ATFConverter()
        >>> c.process(["a", "a2", "a3", "geme2", "bad3", "buru14"])
        ['a', 'a₂', 'a₃', 'geme₂', 'bad₃', 'buru₁₄']
        >>> c = ATFConverter(two_three=False)
        >>> c.process(["a", "a2", "a3", "geme2", "bad3", "buru14"])
        ['a', 'á', 'à', 'géme', 'bàd', 'buru₁₄']

        :param tokens: signs in ATF transliteration
        :return: the converted signs, in the same order
        """

        return [self._convert_num(_convert_consonant(token)) for token in tokens]
# Source code for cltk.text.akk
#
# The Classical Language Toolkit
#
# Navigation
#
# Related Topics