# This script is for faking the recognition by providing the transcription.
# It produces fake lattices from these transcriptions and puts them into a specified directory

import re
import getopt
import sys
import base64
import os
import os.path

def parse_input(input):
    """Turn the lines of a pseudo-XML transcription into a stream of events.

    Each line is stripped of trailing whitespace and classified by what it
    starts with (the patterns are anchored at the beginning of the line only):

    * ``<doc id="...">`` / ``<DOC id='...'>``  -> ``("begindoc", uri)``
    * ``</doc>`` / ``</DOC>``                  -> ``("enddoc", uri)``
    * ``<s>`` / ``<S>``                        -> ``("begins", None)``
    * ``</s>`` / ``</S>``                      -> ``("ends", None)``
    * anything else                            -> ``("token", line)``

    ``uri`` in an "enddoc" event is the id of the most recently opened
    document (``None`` if no document has been opened yet).

    :param input: iterable of text lines (e.g. an open file or ``sys.stdin``).
    :return: generator of ``(event, data)`` tuples.
    """
    # Plain raw strings instead of the Python-2-only ``ur''`` prefix, which is
    # a SyntaxError on Python 3; the patterns themselves are unchanged.
    doc_open_double = re.compile(r'<(DOC|doc) id="([^"]*)">')
    doc_open_single = re.compile(r"<(DOC|doc) id='([^']*)'>")
    doc_close = re.compile(r'</(DOC|doc)>')
    sentence_open = re.compile(r'<[Ss]>')
    sentence_close = re.compile(r'</[Ss]>')

    uri = None

    for line in input:
        line = line.rstrip()

        # The id may be quoted with either double or single quotes.
        m = doc_open_double.match(line)
        if m is None:
            m = doc_open_single.match(line)

        if m is not None:
            uri = m.group(2)
            yield ("begindoc", uri)
            continue

        if doc_close.match(line) is not None:
            yield ("enddoc", uri)
            continue

        if sentence_open.match(line) is not None:
            yield ("begins", None)
            continue

        if sentence_close.match(line) is not None:
            yield ("ends", None)
            continue

        # Every other line (including an empty one) is passed on as a token.
        yield ("token", line)

def iter_documents(parsed_input):
    """Group a stream of parse events into documents.

    :param parsed_input: iterable of ``(event, data)`` tuples where event is
        one of "begindoc", "enddoc", "begins", "ends" or "token".
    :return: generator of ``(uri, sentences)`` tuples; ``sentences`` is a
        list of token lists.

    Tokens that are not wrapped in explicit sentence markers are collected
    into an implicit sentence.  A document that is not explicitly closed is
    still emitted when the next document begins or when the input ends,
    provided it has a uri and at least one sentence.  Tokens seen outside of
    any document are dropped.
    """
    tokens = []
    ss = []
    uri = None
    for event, data in parsed_input:
        if event == "enddoc":
            if tokens:
                # No explicit sentence end: the pending tokens form an
                # implicit final sentence.
                ss.append(tokens)
            yield (uri, ss)

            ss = []
            tokens = []
            uri = None
        elif event == "begindoc":
            # The previous document was never closed; flush what it had.
            if tokens:
                ss.append(tokens)
            if ss and uri is not None:
                yield (uri, ss)

            tokens = []
            ss = []
            uri = data
        elif event == "begins":
            # Tokens preceding an explicit sentence form their own sentence.
            if tokens:
                ss.append(tokens)

            tokens = []
        elif event == "ends":
            ss.append(tokens)
            tokens = []
        elif event == "token":
            tokens.append(data)

    # End of input: emit a trailing document that was never closed, using the
    # same rule as when a new document starts.  After an explicit "enddoc"
    # uri is None, so nothing is emitted twice.
    if tokens:
        ss.append(tokens)
    if ss and uri is not None:
        yield (uri, ss)

def write_sentence_lattice(of, tokens):
    """Write a single-path lattice for one sentence to ``of``.

    The lattice is a linear chain of ``len(tokens) + 4`` nodes::

        !NULL -> <s> -> TOKEN_1 -> ... -> TOKEN_n -> </s> -> !NULL

    joined by ``len(tokens) + 3`` links, matching the counts announced in
    the ``N=... L=...`` header line.  Tokens are upper-cased and each token
    node is placed 0.5 time units after the previous one.

    :param of: object with a ``write(str)`` method (e.g. an open text file).
    :param tokens: list of token strings of the sentence.
    """
    of.write("N=%d L=%d\n" % (len(tokens) + 4, len(tokens) + 3))

    # Nodes.
    of.write("I=0 t=0 W=!NULL\n")
    of.write("I=1 t=0 W=<s>\n")

    node = 2
    t = 0.0

    for token in tokens:
        of.write("I=%d t=%f W=%s\n" % (node, t, token.upper()))
        node += 1
        t += 0.5

    of.write("I=%d t=%f W=</s>\n" % (node, t))
    of.write("I=%d t=%f W=!NULL\n" % (node + 1, t))

    # Links: link J always joins node J to node J + 1.
    of.write("J=0 S=0 E=1\n")

    link = 1

    # Links ending in a token node carry (zero) l= and a= scores.
    for _ in tokens:
        of.write("J=%d S=%d E=%d l=%f a=%f\n" % (link, link, link + 1, 0.0, 0.0))
        link += 1

    # Close the chain: last token (or <s>) -> </s> -> final !NULL.  Previously
    # a single link repeating the preceding one was written here, which left
    # the last two nodes unconnected and one link short of the header count.
    of.write("J=%d S=%d E=%d\n" % (link, link, link + 1))
    of.write("J=%d S=%d E=%d\n" % (link + 1, link + 1, link + 2))

def write_document_lattices(directory, prefix, ss):
    """Write one lattice file per sentence of a document.

    Files are named ``<prefix>_<start>_<end>.lat`` with both numbers
    zero-padded to nine digits.  The numbers come from a running counter
    that advances by 50 before each sentence and by 200 per token within it.

    :param directory: directory the files are created in.
    :param prefix: file name prefix identifying the document.
    :param ss: list of sentences, each a list of token strings.
    """
    position = 0
    for tokens in ss:
        position += 50

        startn = position
        # Just some magic value; possibly a sample number or milliseconds
        # (the original author was not sure either).
        position += len(tokens) * 200
        endn = position

        filename = prefix + "_%09d_%09d.lat" % (startn, endn)

        # ``with`` guarantees the file is closed even if writing fails.
        with open(os.path.join(directory, filename), "w") as f:
            write_sentence_lattice(f, tokens)

def encodeUri(uri):
    """Return the base64 encoding of ``uri`` as text, for use in file names.

    Works on both Python 2 and Python 3: a text uri is encoded as UTF-8
    before base64 encoding, and the result is always returned as a native
    ``str``.

    :param uri: document identifier, text or bytes.
    :return: base64 text with every "_" replaced by "(".
    """
    if not isinstance(uri, bytes):
        # b64encode only accepts bytes on Python 3.
        uri = uri.encode("utf-8")

    encoded = base64.b64encode(uri)
    if not isinstance(encoded, str):
        # Python 3 returns bytes; base64 output is pure ASCII.
        encoded = encoded.decode("ascii")

    # NOTE(review): the standard base64 alphabet never produces "_", so this
    # replacement has no effect, while "/" (which it can produce) is left in
    # the result -- confirm which character was meant before changing it.
    return encoded.replace("_", "(")

def write_documents_lattices(directory, docs):
    """Write the lattice files of every document in ``docs``.

    :param directory: directory the files are created in.
    :param docs: iterable of ``(uri, sentences)`` pairs; the encoded uri is
        used as the file name prefix of the document's lattices.
    """
    for doc_uri, sentences in docs:
        write_document_lattices(directory, encodeUri(doc_uri), sentences)

def run(directory, input):
    """Convert the transcription read from ``input`` into lattice files.

    :param directory: directory the lattice files are written to.
    :param input: iterable of transcription lines.
    """
    events = parse_input(input)
    documents = iter_documents(events)
    write_documents_lattices(directory, documents)


if __name__ == "__main__":
    # Script entry point: transcription comes from standard input and the
    # lattice files are written to the "out" directory.
    run(directory="out", input=sys.stdin)

    
