Source code for myanmar.converter

# converter.py - converter module
# coding: utf-8
# The MIT License (MIT)
# Copyright (c) 2018 Thura Hlaing

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
# OR OTHER DEALINGS IN THE SOFTWARE.

import sys

from myanmar import language
from myanmar import encodings


def get_supported_encodings():
    """
    Return the names of the encodings the ``converter`` module handles.

    A new list is built on every call, so callers may modify the result
    freely.

    >>> get_supported_encodings()
    ['unicode', 'zawgyi', 'wininnwa']
    """
    supported = ('unicode', 'zawgyi', 'wininnwa')
    return list(supported)
# Encoder instances keyed by the encoding names accepted by ``convert`` and
# ``convert_syllable``; created once when the module is imported.
encoders = dict(
    unicode=encodings.UnicodeEncoding(),
    zawgyi=encodings.ZawgyiEncoding(),
    wininnwa=encodings.WininnwaEncoding(),
)
def convert(text, fromenc, toenc):
    """
    Convert text in ``fromenc`` encoding to ``toenc`` encoding.

    The text is split with ``language.MorphoSyllableBreak`` and converted
    one piece at a time:

    * a piece that carries nothing but its ``'syllable'`` entry is unmatched
      text and is copied through unchanged;
    * a piece whose full text appears in the source encoder's
      ``reverse_table`` is mapped directly through the target encoder's
      ``table``;
    * anything else is handed to ``convert_syllable``.

    :param text: text to convert.
    :param fromenc: name of the source encoding (a key of ``encoders``).
    :param toenc: name of the target encoding (a key of ``encoders``).
    :returns: the converted text.
    :raises NotImplementedError: if ``fromenc`` or ``toenc`` is not a key of
        ``encoders``.

    >>> convert('အကျိုးတရား', 'unicode', 'zawgyi')
    'အက်ိဳးတရား'
    >>> convert('ဉာဏ္ႀကီးရွင္', 'zawgyi', 'unicode')
    'ဉာဏ်ကြီးရှင်'
    >>> convert('&[ef;', 'wininnwa', 'unicode')
    'ရဟန်း'
    """
    if fromenc not in encoders:
        raise NotImplementedError("Unsupported encoding: %s" % fromenc)

    if toenc not in encoders:
        raise NotImplementedError("Unsupported encoding: %s" % toenc)

    fromencoder = encoders[fromenc]
    toencoder = encoders[toenc]

    iterator = language.MorphoSyllableBreak(text=text, encoding=fromencoder)

    # Collect the converted pieces and join once at the end instead of
    # growing a string with ``+=`` on every iteration.
    pieces = []
    for syllable in iterator:
        full_syllable = syllable['syllable']

        if len(syllable) == 1:
            # unmatched text, no need to convert
            pieces.append(full_syllable)
            continue

        if full_syllable in fromencoder.reverse_table:
            # Direct mapping
            key = fromencoder.reverse_table[full_syllable]
            # Drop everything from the first '_' on (the "_part" suffix).
            key = key.partition('_')[0]
            pieces.append(toencoder.table[key])
            continue

        pieces.append(convert_syllable(syllable, fromenc, toenc))

    return "".join(pieces)
def convert_syllable(syllable, fromenc, toenc):
    """
    Convert one broken-down syllable from ``fromenc`` to ``toenc``.

    :param syllable: mapping of syllable part name to the text of that part,
        plus a ``'syllable'`` entry holding the complete syllable.  The
        mapping is modified in place: part texts are replaced by table keys
        and some parts are merged.
    :param fromenc: name of the source encoding (a key of ``encoders``).
    :param toenc: name of the target encoding (a key of ``encoders``).
    :returns: the converted syllable, assembled in the order given by the
        target encoder's ``syllable_parts``.  A part whose key is missing
        from the target table is skipped and reported on standard error.
    """
    fromencoder = encoders[fromenc]
    toencoder = encoders[toenc]

    # Only values of existing keys are reassigned inside this loop, so
    # iterating the mapping while updating it is safe.
    for part in syllable:
        if part == 'syllable':
            continue  # skip complete syllable

        key = fromencoder.reverse_table[syllable[part]]
        key = key.partition('_')[0]  # remove _part

        if part == "consonant":
            if key == "na":
                key += choose_na_variant(syllable)
            if key == "ra":
                key += choose_ra_variant(syllable)
            if key == "nnya":
                key += choose_nnya_variant(syllable)
        elif part == "yapin":
            key += choose_yapin_variant(syllable)
        elif part == "yayit":
            key += choose_yayit_variant(syllable)
        elif part == "uVowel":
            key += choose_uvowel_variant(syllable)
        elif part == "aaVowel":
            key += choose_aavowel_variant(syllable)
        elif part == "dotBelow":
            key += choose_dot_below_variant(syllable)

        syllable[part] = key

    # Merge parts into combined keys ("<a>-<b>") and drop the absorbed part.
    if 'uVowel' in syllable and 'hatoh' in syllable:
        syllable['hatoh'] = syllable['hatoh'] + '-' + syllable['uVowel']
        del syllable['uVowel']
    if 'wasway' in syllable and 'hatoh' in syllable:
        syllable['wasway'] = syllable['wasway'] + '-' + syllable['hatoh']
        del syllable['hatoh']

    # collect codepoints in syllable, in correct syllable order
    pieces = []
    for part in toencoder.syllable_parts:
        if part not in syllable:
            continue

        key = syllable[part]
        try:
            pieces.append(toencoder.table[key])
        except KeyError:
            # Best effort: skip the part, but report it on stderr so the
            # diagnostic never mixes with converted text on stdout.
            print(key, syllable, file=sys.stderr)

    return "".join(pieces)


def is_wide_consonant(char):
    """Return True if ``char`` is one of the consonant keys listed as wide."""
    wide_consonants = (
        "ka", "gha", "ca", "cha", "nya", "nna", "ta", "tha", "bha", "ya",
        "la", "sa", "ha", "a", "greatSa",
    )
    return char in wide_consonants


def is_lower_consonant(char):
    """Return True if ``char`` is one of the consonant keys listed as lower."""
    lower_consonants = (
        "nya",
        "na",
        "ra",  # ... more
    )
    return char in lower_consonants


def has_lower_marks(syllable, filters=()):
    """
    Return True if ``syllable`` contains any lower-mark part.

    :param syllable: mapping keyed by syllable part name.
    :param filters: part names to leave out of the check.
    """
    marks = ("stack", "wasway", "yapin", "yayit", "hatoh", "uVowel")
    return any(mark in syllable for mark in marks if mark not in filters)


def has_upper_marks(syllable, filters=()):
    """
    Return True if ``syllable`` contains any upper-mark part.

    :param syllable: mapping keyed by syllable part name.
    :param filters: part names to leave out of the check.
    """
    marks = ("kinzi", "yapin", "iVowel", "aiVowel", "anusvara")
    return any(mark in syllable for mark in marks if mark not in filters)


def choose_ra_variant(syllable):
    """Return ``"_alt"`` if a lower mark other than hatoh is present, else ``""``."""
    return "_alt" if has_lower_marks(syllable, ["hatoh"]) else ""


def choose_na_variant(syllable):
    """Return ``"_alt"`` if any lower mark is present, else ``""``."""
    return "_alt" if has_lower_marks(syllable) else ""


def choose_nnya_variant(syllable):
    """Return ``"_alt"`` if any lower mark is present, else ``""``."""
    return "_alt" if has_lower_marks(syllable) else ""


def choose_uvowel_variant(syllable):
    """
    Return ``"_tall"`` if a lower mark other than uVowel and hatoh is
    present, else ``""``.
    """
    return "_tall" if has_lower_marks(syllable, ["uVowel", "hatoh"]) else ""


def choose_aavowel_variant(syllable):
    """
    Return ``"_tall"`` when the consonant is one of kha, gha, nga, da, dha,
    pa, wa and none of yapin, yayit, wasway, hatoh is present; else ``""``.
    """
    tall_consonants = ("kha", "gha", "nga", "da", "dha", "pa", "wa")
    if syllable['consonant'] not in tall_consonants:
        return ""
    blocking_parts = ('yapin', 'yayit', 'wasway', 'hatoh')
    if any(part in syllable for part in blocking_parts):
        return ""
    return "_tall"


def choose_yayit_variant(syllable):
    """
    Build the yayit key suffix: ``"_wide"`` or ``"_narrow"`` from the
    consonant, then ``"_lower"`` if a lower mark other than yayit and uVowel
    is present, then ``"_upper"`` if an upper mark is present.
    """
    key = "_wide" if is_wide_consonant(syllable['consonant']) else "_narrow"
    if has_lower_marks(syllable, ["yayit", "uVowel"]):
        key += "_lower"
    if has_upper_marks(syllable, ["yayit"]):
        key += "_upper"
    return key


def choose_yapin_variant(syllable):
    """
    Return ``"_alt"`` if a lower mark other than yapin and uVowel is
    present, else ``""``.
    """
    return "_alt" if has_lower_marks(syllable, ["yapin", "uVowel"]) else ""


def choose_dot_below_variant(syllable):
    """
    Return the dotBelow key suffix (``""``, ``"_alt"`` or ``"_alt_alt"``).

    The checks are ordered and the first match wins: consonant ``na``,
    consonant ``ra``, uVowel (with or without yayit), yapin, wasway.
    """
    consonant = syllable['consonant']
    if consonant == "na":
        return "_alt"
    if consonant == "ra":
        return "_alt_alt"
    if "uVowel" in syllable:
        return "_alt_alt" if 'yayit' in syllable else '_alt'
    if "yapin" in syllable:
        return "_alt"
    if "wasway" in syllable:
        return "_alt_alt"
    return ""


def main():
    """
    Command-line entry point: convert the given files, or stdin when no
    file is given, from one encoding to another and write the result to
    stdout.

    Prints a message and exits with status -1 when either encoding is
    unsupported or when both encodings are the same.
    """
    import argparse
    import fileinput

    parser = argparse.ArgumentParser(
        description='Convert between various Myanmar encodings'
    )
    parser.add_argument(
        '-f',
        '--from',
        dest='fro',
        action='store',
        required=True,
        help='convert characters from ENCODING',
        metavar="ENCODING",
    )
    parser.add_argument(
        '-t',
        '--to',
        dest='to',
        action='store',
        required=True,
        help='convert characters to ENCODING',
        metavar="ENCODING",
    )
    parser.add_argument(
        'files',
        metavar='FILE',
        nargs='*',
        help='files to convert, if empty, stdin is used'
    )

    args = parser.parse_args()

    # Validate the source encoding first, then the target encoding.
    supported = get_supported_encodings()
    for name in (args.fro, args.to):
        if name not in supported:
            print(
                "%s is not a supported encoding. Should be any of %s." %
                (name, supported)
            )
            sys.exit(-1)

    if args.fro == args.to:
        print("from encoding must not be the same as to encoding.")
        sys.exit(-1)

    for line in fileinput.input(files=args.files):
        # Each line keeps its own line ending, so none is added here.
        print(convert(line, args.fro, args.to), end='')


if __name__ == "__main__":
    main()