Language Model

proc lmAccu { langmod text } { 
        upvar \$langmod l 
        incr l(cnt) [llength \$text] 
        set v [lindex \$text 0] 
        for { set x 1 } { \$x < [llength \$text] } { incr x } { 
             set w [lindex \$text \$x] ; set q \$v,\$w
             if {![info exist l(1,\$v)]} {set l(1,\$v) 1 ; incr l(m)} else {incr l(1,\$v)} 
             if {![info exist l(2,\$q)]} {set l(2,\$q) 1 ; incr l(b)} else {incr l(2,\$q)} 
             set v \$w 
        } 
} 

proc lmUpdate { langmod } { 
        upvar \$langmod l ; set disc 0.01 
        foreach i [array names l] { 
             regexp {(.*),(.*)} \$i dummy n v ; regexp {(.*),(.*),(.*)} \$i dummy n v w 
             if {\$n == 1} {set l(p,\$i) [expr log((\$l(\$i)-\$disc)/\$l(cnt)) /2.30259]} 
             if {\$n == 2} {set l(p,\$i) [expr log((\$l(\$i)-\$disc)/\$l(1,\$v))/2.30259]} 
        } 
} 

proc lmWrite { langmod filename } {
        upvar \$langmod l 
        set f [open \$filename w] ; set mlist {} ; set blist {} 

        foreach i [array names l] {
             if [regexp {p,1,(.*)} \$i d v ] { lappend mlist "\$v \$l(\$i) 0.0" } 
             if [regexp {p,2,(.*),(.*)} \$i d v w] { lappend blist "\$v \$w \$l(\$i)" } 
        } 
        set mlist [lsort -ascii \$mlist] ; set blist [lsort -ascii \$blist] 
        puts \$f "\\\data\\\\\\nngram 1=\$l(m)\\nngram 2=\$l(b)\\n\\n\\\1-grams:" 
        foreach m \$mlist { puts \$f "[lindex \$m 1] [lindex \$m 0] [lindex \$m 2]" } 
        puts \$f "\\n\\\\2-grams:" 
        foreach b \$blist { puts \$f "[lindex \$b 2] [lindex \$b 0] [lindex \$b 1]" } 
        puts \$f "\\\end\\\" 
        close \$f 
} 

[DBase db] open ../step1/db.dat ../step1/db.idx -mode r 

set lm(cnt) 0 ; set lm(m) 1 ; set lm(b) 0 ; set lm(1,</s>) 1

 foreach utt [db] { 
      makeArray arr [db get \$utt] 
      set text [concat {<s>} \$arr(TEXT) {</s>}] 
      puts "\$utt \$text" 
      lmAccu lm \$text 
} 

lmUpdate lm 
lmWrite lm langmod 

exit