# xcp.py: XeTeX Chinese Preprocessor 0.2.2
# Changelog:
#   0.2.2: prevent unnecessary processing with '@'
#          before '{'         
#   0.2.1: minor upgrade to fix comment handling

import codecs, sys

# Line terminator: recognised at the end of each input line and written
# between output lines.
LINE_END = '\n'

# Lines are only processed between these two markers; everything outside
# them is copied through unchanged.
MARK_START = r'\begin{document}'
MARK_END = r'\end{document}'

# Markers that switch the "verbatim" flag on and off.
VERB_START = r'\begin{verbatim}'
VERB_END = r'\end{verbatim}'

# Open the input file named by the first command-line argument.
#
# The processed document is written to stdout, so diagnostics must go to
# stderr (otherwise they would end up inside the generated TeX file), and
# failures must be reported with a non-zero exit status.
if len(sys.argv) < 2:
    sys.stderr.write("usage: %s FILE\n" % sys.argv[0])
    sys.exit(1)

try:
    f = codecs.open(sys.argv[1], encoding='utf-8')
except IOError:
    sys.stderr.write("file %s no exist\n" % sys.argv[1])
    sys.exit(1)

# Set to 1 after the line containing MARK_START has been copied, and back
# to 0 when MARK_END is seen; lines are only processed while it is > 0.
start = 0
# Stream that output() writes the encoded result to.
stdout = sys.stdout

def output(x):
    """Write the text *x* to ``stdout`` encoded as UTF-8."""
    encoded = x.encode('utf-8')
    stdout.write(encoded)

def output_line_end():
    """Emit one line terminator (``LINE_END``) through ``output``."""
    terminator = LINE_END
    output(terminator)

def isch(x):
    """Return 1 if the code point *x* is treated as a CJK character, else 0.

    *x* is an integer code point (as returned by ``ord``).  The result is
    an int rather than a bool because callers compare it with ``== 0`` and
    ``== 1``.
    """
    # CJK radicals, CJK symbols and punctuation, kana, and the other
    # blocks up to the end of CJK Compatibility.
    if 0x2e80 <= x <= 0x33ff:
        return 1

    # CJK Unified Ideographs Extension A
    if 0x3400 <= x <= 0x4dbf:
        return 1

    # CJK Unified Ideographs
    if 0x4e00 <= x <= 0x9fbb:
        return 1

    # CJK Compatibility Ideographs
    if 0xf900 <= x <= 0xfad9:
        return 1

    # Halfwidth and Fullwidth Forms
    if 0xff00 <= x <= 0xffef:
        return 1

    # CJK Unified Ideographs Extension B
    if 0x20000 <= x <= 0x2a6d6:
        return 1

    # CJK Compatibility Ideographs Supplement.  The lower bound was
    # previously written as 0x2f8000, which made this range unmatchable.
    if 0x2f800 <= x <= 0x2fa1d:
        return 1

    return 0

# Processing state, carried across characters and across lines.
ch = 0        # 1 while a '\zh{' group is open in the output
b = 0         # 0 until the first line of the document body is reached
lc = 0        # code point of the last character emitted (0 at start)
comment = 0   # 1 while inside a '%' comment on the current line
enable = 1    # 0 while an '@{...}' group is copied through unprocessed
level = 0     # brace nesting depth inside the current '@{...}' group
verbatim = 0  # 1 between VERB_START and VERB_END

# Lines before MARK_START and from MARK_END onwards are copied unchanged.
# In between, each line is emitted WITHOUT its terminator; the terminator
# is written when the next line starts, so that a closing '}' or a joining
# '%' can still be appended to the previous output line.
for line in f:
    if verbatim == 0 and line.find(MARK_END) != -1:
        # Close a \zh{ group that is still open at the end of the body;
        # otherwise the generated document has an unbalanced brace.
        if ch == 1:
            ch = 0
            output('}')
        output_line_end()
        start = 0

    if start > 0:
        if line.find(VERB_START) != -1:
            verbatim = 1

        if line.find(VERB_END) != -1:
            verbatim = 0

        # if last \zh{ has not yet been closed
        if ch == 1 and (isch(ord(line[0])) == 0 or line[0] == '\n'):
            ch = 0
            output('}')

        # Previous line ended with a CJK character and this one starts
        # with one: end the previous output line with '%'.
        if isch(lc) and isch(ord(line[0])):
            output('%')

        # Terminate the previous body line (nothing to terminate before
        # the first one).
        if b == 0:
            b = 1
        else:
            output_line_end()

        # if line.find('\\XeTeX') != -1:
        #    output(line[:-1])
        #    continue

        # An empty line resets the "last character" to the terminator.
        if line[0] == LINE_END:
            lc = ord(LINE_END)

        line_len = len(line)
        for i in range(line_len):
            c = line[i]
            # An unescaped '%' starts a comment running to end of line.
            if c == '%' and lc != ord('\\'):
                # Close an open \zh{ before the comment begins.  Left
                # open, its '}' would later be appended to this same
                # output line, after the comment text, where TeX never
                # sees it.
                if ch == 1 and enable == 1:
                    ch = 0
                    output('}')
                comment = 1

            # The terminator is not emitted here (see note above the loop).
            if c == LINE_END:
                comment = 0
                break

            n = ord(c)
            # Comment text is copied through untouched.
            if comment == 1:
                output(c)
                lc = n
                continue

            # '@{' starts a group that is copied without processing; the
            # '@' itself is dropped from the output.
            if c == '@' and i + 1 < line_len and line[i + 1] == '{':
                enable = 0
                level = 0
                continue

            # match parens
            if enable == 0 and c == '{':
                level += 1

            if enable == 0 and c == '}':
                level -= 1

            # The brace that opened the '@{' group has been closed:
            # resume processing, starting with this closing brace.
            if enable == 0 and level == 0:
                enable = 1

            # output directly without processing
            if enable == 0:
                output(c)
                lc = n
                continue

            # chinese starts
            if ch == 0 and isch(n) == 1:
                output('\\zh{')
                ch = 1

            # chinese ends
            elif ch == 1 and isch(n) == 0:
                output('}')
                ch = 0
            lc = n
            output(c)
    else:
        output(line)

    # Start processing with the line AFTER the one containing MARK_START.
    if line.find(MARK_START) != -1:
        start = 1

# Release the input file once every line has been handled.
f.close()

