proc lmAccu { langmod text } {
upvar \$langmod l
incr l(cnt) [llength \$text]
set v [lindex \$text 0]
for { set x 1 } { \$x < [llength \$text] } { incr x } {
set w [lindex \$text \$x] ; set q \$v,\$w
if {![info exist l(1,\$v)]} {set l(1,\$v) 1 ; incr l(m)} else {incr l(1,\$v)}
if {![info exist l(2,\$q)]} {set l(2,\$q) 1 ; incr l(b)} else {incr l(2,\$q)}
set v \$w
}
}
proc lmUpdate { langmod } {
upvar \$langmod l ; set disc 0.01
foreach i [array names l] {
regexp {(.*),(.*)} \$i dummy n v ; regexp {(.*),(.*),(.*)} \$i dummy n v w
if {\$n == 1} {set l(p,\$i) [expr log((\$l(\$i)-\$disc)/\$l(cnt)) /2.30259]}
if {\$n == 2} {set l(p,\$i) [expr log((\$l(\$i)-\$disc)/\$l(1,\$v))/2.30259]}
}
}
proc lmWrite { langmod filename } {
upvar \$langmod l
set f [open \$filename w] ; set mlist {} ; set blist {}
foreach i [array names l] {
if [regexp {p,1,(.*)} \$i d v ] { lappend mlist "\$v \$l(\$i) 0.0" }
if [regexp {p,2,(.*),(.*)} \$i d v w] { lappend blist "\$v \$w \$l(\$i)" }
}
set mlist [lsort -ascii \$mlist] ; set blist [lsort -ascii \$blist]
puts \$f "\\\data\\\\\\nngram 1=\$l(m)\\nngram 2=\$l(b)\\n\\n\\\1-grams:"
foreach m \$mlist { puts \$f "[lindex \$m 1] [lindex \$m 0] [lindex \$m 2]" }
puts \$f "\\n\\\\2-grams:"
foreach b \$blist { puts \$f "[lindex \$b 2] [lindex \$b 0] [lindex \$b 1]" }
puts \$f "\\\end\\\"
close \$f
}
[DBase db] open ../step1/db.dat ../step1/db.idx -mode r
set lm(cnt) 0 ; set lm(m) 1 ; set lm(b) 0 ; set lm(1,</s>) 1
foreach utt [db] {
makeArray arr [db get \$utt]
set text [concat {<s>} \$arr(TEXT) {</s>}]
puts "\$utt \$text"
lmAccu lm \$text
}
lmUpdate lm
lmWrite lm langmod
exit