# Source code for myanmar.converter

# converter.py - converter module
# coding: utf-8
# The MIT License (MIT)
# Copyright (c) 2018 Thura Hlaing

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
# OR OTHER DEALINGS IN THE SOFTWARE.

import sys

from myanmar import language
from myanmar import encodings


def get_supported_encodings():
    """
    Get a list of encodings supported by ``converter`` module.

    A fresh list is built on every call, so callers may modify the result
    without affecting later calls.

    :returns: list of encoding names.

    >>> get_supported_encodings()
    ['unicode', 'zawgyi', 'wininnwa']
    """
    return ['unicode', 'zawgyi', 'wininnwa']


# Registry of encoder instances keyed by encoding name. ``convert`` and
# ``convert_syllable`` look their source and target encoders up here.
encoders = dict(
    unicode=encodings.UnicodeEncoding(),
    zawgyi=encodings.ZawgyiEncoding(),
    wininnwa=encodings.WininnwaEncoding(),
)


def convert(text, fromenc, toenc):
    """
    Convert text in ``fromenc`` encoding to ``toenc`` encoding.

    The text is split into syllables with ``language.MorphoSyllableBreak``.
    Each syllable is then either copied through unchanged (unmatched text),
    translated with a single table lookup (when the whole syllable has a
    direct mapping), or converted part by part via ``convert_syllable``.

    :param text: the text to convert.
    :param fromenc: name of the source encoding (a key of ``encoders``).
    :param toenc: name of the target encoding (a key of ``encoders``).
    :returns: the converted text.
    :raises NotImplementedError: if ``fromenc`` or ``toenc`` is not a
        supported encoding name (``fromenc`` is checked first).

    >>> convert('အကျိုးတရား', 'unicode', 'zawgyi')
    'အက်ိဳးတရား'
    >>> convert('ဉာဏ္ႀကီးရွင္', 'zawgyi', 'unicode')
    'ဉာဏ်ကြီးရှင်'
    >>> convert('&[ef;', 'wininnwa', 'unicode')
    'ရဟန်း'
    """
    for name in (fromenc, toenc):
        if name not in encoders:
            raise NotImplementedError("Unsupported encoding: %s" % name)

    fromencoder = encoders[fromenc]
    toencoder = encoders[toenc]
    iterator = language.MorphoSyllableBreak(text=text, encoding=fromencoder)

    # Collect converted pieces and join once at the end instead of growing
    # a string with repeated ``+=``.
    pieces = []
    for syllable in iterator:
        full_syllable = syllable['syllable']
        if len(syllable) == 1:
            # Only the 'syllable' entry is present: unmatched text, no need
            # to convert.
            pieces.append(full_syllable)
            continue

        if full_syllable in fromencoder.reverse_table:
            # Direct mapping: the whole syllable is a single table entry.
            key = fromencoder.reverse_table[full_syllable]
            key = key.partition('_')[0]  # remove _part
            pieces.append(toencoder.table[key])
            continue

        pieces.append(convert_syllable(syllable, fromenc, toenc))

    return "".join(pieces)


def convert_syllable(syllable, fromenc, toenc):
    """
    Convert a single multi-part syllable from ``fromenc`` to ``toenc``.

    Every part of ``syllable`` except the ``'syllable'`` entry is looked up
    in the source encoder's ``reverse_table`` to obtain a table key, a
    glyph-variant suffix is appended where the part requires one, and the
    resulting keys are mapped through the target encoder's ``table`` in the
    order given by its ``syllable_parts``.

    NOTE: ``syllable`` is modified in place. Part values are replaced by
    table keys, and the ``'uVowel'`` / ``'hatoh'`` entries may be merged
    into a neighbouring entry and deleted.

    :param syllable: dict of syllable parts, as produced while iterating
        ``language.MorphoSyllableBreak`` in ``convert``.
    :param fromenc: name of the source encoding (a key of ``encoders``).
    :param toenc: name of the target encoding (a key of ``encoders``).
    :returns: the converted syllable as a string. Parts whose key cannot be
        mapped are left out and reported on standard error.
    """
    fromencoder = encoders[fromenc]
    toencoder = encoders[toenc]

    # Values are rewritten while iterating; no keys are added or removed in
    # this loop. The choose_* helpers compare syllable['consonant'] with
    # table key names, so they rely on the consonant entry having been
    # rewritten before the dependent parts are visited.
    # NOTE(review): presumably the syllable breaker yields 'consonant' ahead
    # of the marks that depend on it -- confirm against MorphoSyllableBreak.
    for part in syllable.keys():
        if part == 'syllable':
            continue  # skip complete syllable

        key = fromencoder.reverse_table[syllable[part]]
        key = key.partition('_')[0]  # remove _part

        if part == "consonant":
            if key == "na":
                key += choose_na_variant(syllable)
            elif key == "ra":
                key += choose_ra_variant(syllable)
            elif key == "nnya":
                key += choose_nnya_variant(syllable)
        elif part == "yapin":
            key += choose_yapin_variant(syllable)
        elif part == "yayit":
            key += choose_yayit_variant(syllable)
        elif part == "uVowel":
            key += choose_uvowel_variant(syllable)
        elif part == "aaVowel":
            key += choose_aavowel_variant(syllable)
        elif part == "dotBelow":
            key += choose_dot_below_variant(syllable)

        syllable[part] = key

    # Merge pairs of marks into one combined key ("<first>-<second>").
    if 'uVowel' in syllable and 'hatoh' in syllable:
        syllable['hatoh'] = syllable['hatoh'] + '-' + syllable['uVowel']
        del syllable['uVowel']
    if 'wasway' in syllable and 'hatoh' in syllable:
        syllable['wasway'] = syllable['wasway'] + '-' + syllable['hatoh']
        del syllable['hatoh']

    osyllable = ""
    # collect codepoints in syllable, in correct syllable order
    for part in toencoder.syllable_parts:
        if part not in syllable:
            continue

        key = syllable[part]
        try:
            osyllable += toencoder.table[key]
        except Exception:
            # Deliberate best effort: drop the part that cannot be mapped and
            # keep converting. The diagnostic goes to stderr so it never
            # mixes with converted text written to stdout.
            print(key, syllable, file=sys.stderr)

    return osyllable


def is_wide_consonant(char):
    """
    Tell whether ``char`` is the key name of a wide consonant.

    :param char: consonant key name, e.g. ``"ka"``.
    :returns: ``True`` if ``char`` is listed as wide, ``False`` otherwise.
    """
    wide_names = (
        "ka",
        "gha",
        "ca",
        "cha",
        "nya",
        "nna",
        "ta",
        "tha",
        "bha",
        "ya",
        "la",
        "sa",
        "ha",
        "a",
        "greatSa",
    )
    return char in wide_names


def is_lower_consonant(char):
    """
    Tell whether ``char`` is the key name of a lower consonant.

    :param char: consonant key name, e.g. ``"na"``.
    :returns: ``True`` if ``char`` is listed below, ``False`` otherwise.
    """
    # The original author marked this list as incomplete ("... more").
    lower_names = ("nya", "na", "ra")
    return char in lower_names


def has_lower_marks(syllable, filters=()):
    """
    Tell whether ``syllable`` contains any lower mark.

    The lower marks are ``stack``, ``wasway``, ``yapin``, ``yayit``,
    ``hatoh`` and ``uVowel``.

    :param syllable: dict of syllable parts keyed by part name.
    :param filters: part names to leave out of the check. The default is an
        immutable empty tuple, so no mutable object is shared between calls.
    :returns: ``True`` if at least one lower mark that is not in ``filters``
        is a key of ``syllable``, ``False`` otherwise.
    """
    lower_marks = ("stack", "wasway", "yapin", "yayit", "hatoh", "uVowel")
    return any(
        mark in syllable for mark in lower_marks if mark not in filters
    )


def has_upper_marks(syllable, filters=()):
    """
    Tell whether ``syllable`` contains any upper mark.

    The upper marks are ``kinzi``, ``yapin``, ``iVowel``, ``aiVowel`` and
    ``anusvara``.

    :param syllable: dict of syllable parts keyed by part name.
    :param filters: part names to leave out of the check. The default is an
        immutable empty tuple, so no mutable object is shared between calls.
    :returns: ``True`` if at least one upper mark that is not in ``filters``
        is a key of ``syllable``, ``False`` otherwise.
    """
    upper_marks = ("kinzi", "yapin", "iVowel", "aiVowel", "anusvara")
    return any(
        mark in syllable for mark in upper_marks if mark not in filters
    )


def choose_ra_variant(syllable):
    """
    Pick the key suffix for the ``ra`` consonant.

    :returns: ``"_alt"`` when the syllable has a lower mark other than
        ``hatoh``, otherwise an empty string.
    """
    if has_lower_marks(syllable, ["hatoh"]):
        return "_alt"
    return ""


def choose_na_variant(syllable):
    """
    Pick the key suffix for the ``na`` consonant.

    :returns: ``"_alt"`` when the syllable has any lower mark, otherwise an
        empty string.
    """
    if has_lower_marks(syllable):
        return "_alt"
    return ""


def choose_nnya_variant(syllable):
    """
    Pick the key suffix for the ``nnya`` consonant.

    :returns: ``"_alt"`` when the syllable has any lower mark, otherwise an
        empty string.
    """
    if has_lower_marks(syllable):
        return "_alt"
    return ""


def choose_uvowel_variant(syllable):
    """
    Pick the key suffix for the ``uVowel`` part.

    :returns: ``"_tall"`` when the syllable has a lower mark other than
        ``uVowel`` and ``hatoh``, otherwise an empty string.
    """
    ignored = ["uVowel", "hatoh"]
    if has_lower_marks(syllable, ignored):
        return "_tall"
    return ""


def choose_aavowel_variant(syllable):
    """
    Pick the key suffix for the ``aaVowel`` part.

    :param syllable: dict of syllable parts; ``syllable['consonant']`` must
        already hold the consonant's key name.
    :returns: ``"_tall"`` when the consonant is one of ``kha``, ``gha``,
        ``nga``, ``da``, ``dha``, ``pa``, ``wa`` and the syllable has none of
        ``yapin``, ``yayit``, ``wasway``, ``hatoh``; otherwise an empty
        string.
    """
    tall_bases = ("kha", "gha", "nga", "da", "dha", "pa", "wa")
    blocking_marks = ('yapin', 'yayit', 'wasway', 'hatoh')

    if syllable['consonant'] not in tall_bases:
        return ""
    if any(mark in syllable for mark in blocking_marks):
        return ""
    return "_tall"


def choose_yayit_variant(syllable):
    """
    Pick the key suffix for the ``yayit`` part.

    The suffix is built from up to three pieces, in this order: ``"_wide"``
    or ``"_narrow"`` depending on the consonant, ``"_lower"`` when the
    syllable has a lower mark other than ``yayit`` and ``uVowel``, and
    ``"_upper"`` when the syllable has an upper mark.

    :param syllable: dict of syllable parts; ``syllable['consonant']`` must
        already hold the consonant's key name.
    :returns: the combined suffix, e.g. ``"_wide_lower"``.
    """
    if is_wide_consonant(syllable['consonant']):
        width = "_wide"
    else:
        width = "_narrow"

    lower = ""
    if has_lower_marks(syllable, ["yayit", "uVowel"]):
        lower = "_lower"

    upper = ""
    if has_upper_marks(syllable, ["yayit"]):
        upper = "_upper"

    return width + lower + upper


def choose_yapin_variant(syllable):
    """
    Pick the key suffix for the ``yapin`` part.

    :returns: ``"_alt"`` when the syllable has a lower mark other than
        ``yapin`` and ``uVowel``, otherwise an empty string.
    """
    ignored = ["yapin", "uVowel"]
    if has_lower_marks(syllable, ignored):
        return "_alt"
    return ""


def choose_dot_below_variant(syllable):
    """
    Pick the key suffix for the ``dotBelow`` part.

    The first matching rule wins:

    * consonant ``na``: ``"_alt"``
    * consonant ``ra``: ``"_alt_alt"``
    * ``uVowel`` present: ``"_alt_alt"`` if ``yayit`` is also present,
      else ``"_alt"``
    * ``yapin`` present: ``"_alt"``
    * ``wasway`` present: ``"_alt_alt"``
    * otherwise: empty string

    :param syllable: dict of syllable parts; ``syllable['consonant']`` must
        already hold the consonant's key name.
    :returns: the chosen suffix.
    """
    base = syllable['consonant']

    if base == "na":
        return "_alt"
    if base == "ra":
        return "_alt_alt"
    if "uVowel" in syllable:
        if 'yayit' in syllable:
            return "_alt_alt"
        return "_alt"
    if "yapin" in syllable:
        return "_alt"
    if "wasway" in syllable:
        return "_alt_alt"
    return ""


def main():
    """
    Command-line entry point: convert files (or stdin) between encodings.

    Parses ``-f/--from`` and ``-t/--to`` (both required) plus an optional
    list of files, checks the encoding names, then writes each converted
    input line to standard output.

    Error messages are written to standard error so they cannot end up
    inside converted text that is being redirected or piped from stdout.
    On an unsupported encoding, or when both encodings are the same, the
    process exits through ``sys.exit(-1)``.
    """
    import argparse
    import fileinput

    parser = argparse.ArgumentParser(
        description='Convert between various Myanmar encodings'
    )
    parser.add_argument(
        '-f',
        '--from',
        dest='fro',
        action='store',
        required=True,
        help='convert characters from ENCODING',
        metavar="ENCODING",
    )
    parser.add_argument(
        '-t',
        '--to',
        dest='to',
        action='store',
        required=True,
        help='convert characters to ENCODING',
        metavar="ENCODING",
    )
    parser.add_argument(
        'files',
        metavar='FILE',
        nargs='*',
        help='files to convert, if empty, stdin is used'
    )

    args = parser.parse_args()
    supported = get_supported_encodings()

    # Check the source encoding first, then the target, reporting the first
    # unsupported name.
    for name in (args.fro, args.to):
        if name not in supported:
            print(
                "%s is not a supported encoding. Should be any of %s." %
                (name, supported),
                file=sys.stderr,
            )
            sys.exit(-1)

    if args.fro == args.to:
        print(
            "from encoding must not be the same as to encoding.",
            file=sys.stderr,
        )
        sys.exit(-1)

    # The context manager closes the input when the loop ends or an error
    # propagates.
    with fileinput.input(files=args.files) as lines:
        for line in lines:
            # Lines keep their own newline, so do not add another one.
            print(convert(line, args.fro, args.to), end='')


# Run the command-line converter only when this module is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    main()