###########################################################################
##                                                                       ##
##                  Language Technologies Institute                      ##
##                     Carnegie Mellon University                        ##
##                       Copyright (c) 2010-2011                         ##
##                        All Rights Reserved.                           ##
##                                                                       ##
##  Permission is hereby granted, free of charge, to use and distribute  ##
##  this software and its documentation without restriction, including   ##
##  without limitation the rights to use, copy, modify, merge, publish,  ##
##  distribute, sublicense, and/or sell copies of this work, and to      ##
##  permit persons to whom this work is furnished to do so, subject to   ##
##  the following conditions:                                            ##
##   1. The code must retain the above copyright notice, this list of    ##
##      conditions and the following disclaimer.                         ##
##   2. Any modifications must be clearly marked as such.                ##
##   3. Original authors' names are not deleted.                         ##
##   4. The authors' names are not used to endorse or promote products   ##
##      derived from this software without specific prior written        ##
##      permission.                                                      ##
##                                                                       ##
##  CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK         ##
##  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      ##
##  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   ##
##  SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE      ##
##  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    ##
##  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   ##
##  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          ##
##  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       ##
##  THIS SOFTWARE.                                                       ##
##                                                                       ##
###########################################################################
##                                                                       ##
##            Authors: Alok Parlikar                                     ##
##            Email:   aup@cs.cmu.edu                                    ##
##                                                                       ##
###########################################################################
##                                                                       ##
##  Syntactic Phrasing Model                                             ##
##                                                                       ##
###########################################################################
#!/bin/bash
# NOTE(review): this interpreter line sits below the licence banner, so it
# only works as a shebang if it is moved to the first line of the file;
# until then start the script with an explicit `bash`.

# Abort at the first failing command rather than carrying on silently.
set -o errexit

# Export the C locale to every command started from here on.
export LANG=C

# The speech tools and festvox installations are located through the
# environment. When either variable is empty or unset, print setup
# instructions and stop with status 1.
if [[ -z "$ESTDIR" ]]; then
    cat <<'EOF'
environment variable ESTDIR is unset
set it to your local speech tools directory e.g.
   bash$ export ESTDIR=/home/awb/projects/speech_tools/
or
   csh% setenv ESTDIR /home/awb/projects/speech_tools/
EOF
    exit 1
fi

if [[ -z "$FESTVOXDIR" ]]; then
    cat <<'EOF'
environment variable FESTVOXDIR is unset
set it to your local festvox directory e.g.
   bash$ export FESTVOXDIR=/home/awb/projects/festvox/
or
   csh% setenv FESTVOXDIR /home/awb/projects/festvox/
EOF
    exit 1
fi

# Pull in the per-voice settings. The rest of this script reads FV_LANG,
# FV_VOICENAME and FV_FULLVOICENAME without ever assigning them, so they
# are presumably defined in this file -- confirm against etc/voice.defs.
# The path is relative: the script has to be started from the directory
# that contains etc/.
. ./etc/voice.defs

# Pick the resource set used for the phrasyn tag/feats/desc file names
# further down: "us" when FV_LANG is exactly "us", "non-us" for anything
# else. FV_LANG is quoted so that an empty or multi-word value simply
# selects "non-us" instead of making the test itself fail with a
# "unary operator expected" / "too many arguments" error.
if [[ "$FV_LANG" == "us" ]]; then
    PHRASYN_LANG="us"
else
    PHRASYN_LANG="non-us"
fi

# Positional arguments (all five are required, whatever the action):
#   $1 ACTION      step to run, or "do_all" to run every step in order
#   $2 PROMPTFILE  path to the txt.done.data prompt list (must exist)
#   $3 MODE        grammar mode; used in the names of the tag/feats/desc
#                  resource files and of every generated file
#   $4 NTCOUNT     nonterminal count, handed to scfg_make -nonterms
#   $5 STOPVALUE   stop value, handed to wagon -stop
# Exit status is 2 for a missing/unknown argument and 1 for a prompt file
# that does not exist.
ACTION=$1
PROMPTFILE=$2
MODE=$3
NTCOUNT=$4
STOPVALUE=$5

if [[ -z "$ACTION" ]]; then
    echo "Please specify an action. One of: "
    echo "dump_phrasing_tree" "build_phrasing_grammar" "utt_parse" "build_break_prediction_tree" "fill_bnb_probs"
    echo "do_all will run them all"
    exit 2
fi

# Reject action names this script does not implement. Without this check a
# mistyped action would pass validation, run the setup steps below, match
# none of the action blocks and still finish with status 0.
case "$ACTION" in
    dump_phrasing_tree | build_phrasing_grammar | utt_parse | \
    build_break_prediction_tree | fill_bnb_probs | do_all) ;;
    *)
        echo "Unknown action: $ACTION"
        echo "Please specify an action. One of: "
        echo "dump_phrasing_tree" "build_phrasing_grammar" "utt_parse" "build_break_prediction_tree" "fill_bnb_probs"
        echo "do_all will run them all"
        exit 2
        ;;
esac

if [[ ! -f "$PROMPTFILE" ]]; then
    echo "Please point me to your txt.done.data file"
    exit 1
fi

if [[ -z "$MODE" ]]; then
    echo "Please specify grammar mode"
    exit 2
fi

if [[ -z "$NTCOUNT" ]]; then
    echo "Please specify nonterminal count for grammar"
    exit 2
fi

if [[ -z "$STOPVALUE" ]]; then
    echo "Please specify stop value for CART training."
    exit 2
fi

# One-time setup shared by every action.

# Directory that receives everything this script generates.
mkdir -p syntax

# Keep a voice-local copy of the phrasyn Scheme code; a copy that is
# already present is left untouched.
if [[ ! -f festvox/phrasyn.scm ]]; then
    echo "Copying phrasyn.scm"
    cp -p "$FESTVOXDIR/src/phrasyn/phrasyn.scm" festvox/
fi

# Split the prompt list twice unless that has been done before. Later
# steps read $PROMPTFILE.train, .train.train, .train.test and .test, so
# ./bin/traintest presumably writes <file>.train and <file>.test -- confirm.
# The paths are quoted so a prompt file located under a directory with
# spaces in its name works here just as it does in the existence check on
# the prompt file itself.
if [[ ! -f "$PROMPTFILE.train.train" ]]; then
    echo "Creating Traintest sets"
    ./bin/traintest "$PROMPTFILE"
    ./bin/traintest "$PROMPTFILE.train"
fi

# Action: dump_phrasing_tree
# Runs festival in batch mode (-b) on festvox/phrasyn.scm and the voice
# definition, sets cg:phrasyn to nil, selects the voice and then calls
# dump_scfg_corpus with the training prompts ($PROMPTFILE.train), the
# corpus file to write and the grammar mode. dump_scfg_corpus is not
# defined in this script; presumably it comes from phrasyn.scm.
# The festival path is quoted so an install location containing spaces
# still resolves to a single command word.
if [[ "$ACTION" == "dump_phrasing_tree" ]]; then
    corpusfile="syntax/$FV_VOICENAME.$MODE.corpus"
    "$FESTVOXDIR/../festival/bin/festival" -b festvox/phrasyn.scm \
        "festvox/$FV_FULLVOICENAME.scm" \
        "(set! cg:phrasyn nil)" \
        "(voice_$FV_FULLVOICENAME )" \
        "(dump_scfg_corpus '$PROMPTFILE.train '$corpusfile '$MODE)"
fi

# Action: build_phrasing_grammar
# 1. scfg_make writes an initial grammar with $NTCOUNT nonterminals over
#    the terminals in the language/mode tag file (-values random,
#    -domain prob).
# 2. scfg_train trains it on the dumped corpus for 20 passes with
#    -checkpoint 1; its output is shown and also saved to <out>.log.
# 3. The pass with the lowest cross entropy in that log is kept as <out>,
#    syntax/grammar.$NTCOUNT.$MODE.out is pointed at it, and the remaining
#    <out>.* files are deleted.
if [[ "$ACTION" == "build_phrasing_grammar" ]]; then
    tagfile="$FESTVOXDIR/src/phrasyn/tags/$PHRASYN_LANG.$MODE"
    corpusfile="syntax/$FV_VOICENAME.$MODE.corpus"
    grammar_base="syntax/$FV_VOICENAME.$NTCOUNT.$MODE"

    "$ESTDIR/bin/scfg_make" -nonterms "$NTCOUNT" -terms "$tagfile" \
        -o "$grammar_base.gram" -values random -domain prob

    "$ESTDIR/bin/scfg_train" -grammar "$grammar_base.gram" -corpus "$corpusfile" \
        -o "$grammar_base.out" -heap 10000000 -passes 20 -checkpoint 1 \
        | tee "$grammar_base.out.log"
    # The pipeline's own status is tee's, so a failing scfg_train would go
    # unnoticed even under `set -e`; read its status explicitly.
    train_status=${PIPESTATUS[0]}
    if [[ "$train_status" -ne 0 ]]; then
        echo "scfg_train failed with exit status $train_status" >&2
        exit "$train_status"
    fi

    # Pick the best grammar from log: reduce every "pass N cross entropy X"
    # line to "N X", sort by X in descending order and take the last line
    # (the smallest X). The pass number is zero-padded to three digits
    # because that is the checkpoint suffix the mv below expects.
    bestpass=$(grep 'pass' "$grammar_base.out.log" \
        | sed 's/^pass \([0-9]*\) cross entropy \([0-9.]*\).*$/\1 \2/g' \
        | sort -k2,2 -rn | tail -1 | awk '{printf("%03d",$1)}')
    if [[ -z "$bestpass" ]]; then
        echo "No 'pass' lines found in $grammar_base.out.log; cannot pick a grammar" >&2
        exit 1
    fi

    mv "$grammar_base.out.$bestpass" "$grammar_base.out"
    ln -sf "$FV_VOICENAME.$NTCOUNT.$MODE.out" "syntax/grammar.$NTCOUNT.$MODE.out"
    # This glob matches the checkpoint files and also the .out.log written
    # above, so the training log is removed together with them.
    rm -f "$grammar_base.out".*
fi

# Action: utt_parse
# Prints the output directory name, makes sure the directory exists, then
# runs festival in batch mode: phrasyn.scm and the voice are loaded,
# cg:phrasyn is set to nil, the trained grammar
# (syntax/$FV_VOICENAME.$NTCOUNT.$MODE.out) is loaded into scfg_grammar and
# utt_parse is called with the prompt file, the output directory and the
# mode. utt_parse is not defined in this script; presumably it comes from
# phrasyn.scm.
# The corpus-file variable that used to be assigned here was never read and
# has been dropped.
if [[ "$ACTION" == "utt_parse" ]]; then
    outdir="syntax/utts.$NTCOUNT.$MODE"
    echo "$outdir"
    mkdir -p "$outdir"

    loadgrammar="(set! scfg_grammar (load \"syntax/$FV_VOICENAME.$NTCOUNT.$MODE.out\" t))"

    "$FESTVOXDIR/../festival/bin/festival" -b festvox/phrasyn.scm \
        "festvox/$FV_FULLVOICENAME.scm" \
        "(set! cg:phrasyn nil)" \
        "(voice_$FV_FULLVOICENAME )" \
        "$loadgrammar" \
        "(utt_parse '$PROMPTFILE '$outdir '$MODE)"
fi

# Action: build_break_prediction_tree
# Dumps Word-relation features for the parsed utterances, trains a wagon
# tree on the train split (using the dev split as wagon's -test set with
# stepwise selection, -swopt B_NB_F1), runs wagon_test on the held-out
# test split and links the tree as syntax/break_prediction.$NTCOUNT.$MODE.tree.
# Every path is quoted so tool or prompt locations containing spaces work.
if [[ "$ACTION" == "build_break_prediction_tree" ]]; then
    outdir="syntax/break_prediction_tree.$NTCOUNT.$MODE"
    mkdir -p "$outdir"

    featsfile="$FESTVOXDIR/src/phrasyn/desc/$PHRASYN_LANG.$MODE.feats"
    descfile="$FESTVOXDIR/src/phrasyn/desc/$PHRASYN_LANG.$MODE.desc"
    dumpfeats="$ESTDIR/../festival/examples/dumpfeats"

    # Turn each prompt line into the path of its parsed utterance; the
    # utterance id is taken from the second whitespace-separated field.
    utt_path_prog='{printf("syntax/utts.%s.%s/%s.utt\n",ntcount, mode, $2)}'

    awk -v ntcount="$NTCOUNT" -v mode="$MODE" "$utt_path_prog" \
        < "$PROMPTFILE.train.train" > "$outdir/uttstrainfile"
    awk -v ntcount="$NTCOUNT" -v mode="$MODE" "$utt_path_prog" \
        < "$PROMPTFILE.train.test" > "$outdir/uttsdevfile"

    "$dumpfeats" -relation Word -feats "$featsfile" -from_file "$outdir/uttstrainfile" \
        -output "$outdir/break.train.data" -eval festvox/phrasyn.scm
    "$dumpfeats" -relation Word -feats "$featsfile" -from_file "$outdir/uttsdevfile" \
        -output "$outdir/break.dev.data" -eval festvox/phrasyn.scm

    # Two awk snippets that append a trailing weight column to each line.
    # NOTE(review): boost20.awk is written out but never applied -- the
    # "break20" files below are produced with boost1.awk (weight 1 on every
    # line). Left exactly as found; confirm which weighting is intended.
    boost20='{if ($1 == "B"){ print $0,20;} else {print $0,1}}'
    boost1='{print $0,1}'

    printf '%s\n' "$boost20" > "$outdir/boost20.awk"
    printf '%s\n' "$boost1" > "$outdir/boost1.awk"

    # Don't train for BB's -- they are free.
    grep -v '^BB' "$outdir/break.train.data" \
        | awk -f "$outdir/boost1.awk" > "$outdir/break20.train.data"
    grep -v '^BB' "$outdir/break.dev.data" \
        | awk -f "$outdir/boost1.awk" > "$outdir/break20.dev.data"

    "$ESTDIR/bin/wagon" -heap 10000000 -desc "$descfile" -stop "$STOPVALUE" \
        -data "$outdir/break20.train.data" -o "$outdir/break_prediction.tree" \
        -test "$outdir/break20.dev.data" -stepwise -swopt B_NB_F1

    # Test the built tree
    awk -v ntcount="$NTCOUNT" -v mode="$MODE" "$utt_path_prog" \
        < "$PROMPTFILE.test" > "$outdir/testutts"
    "$dumpfeats" -relation Word -feats "$featsfile" -from_file "$outdir/testutts" \
        -output "$outdir/test.data" -eval festvox/phrasyn.scm

    # Don't test over BB's --> they are free.
    # grep stays last in this pipeline, as before, so its status (non-zero
    # when nothing is left after filtering) is the pipeline's status.
    awk '{print $0, 1;}' < "$outdir/test.data" | grep -v '^BB' > "$outdir/test20.data"

    "$ESTDIR/bin/wagon_test" -heap 10000000 -desc "$descfile" \
        -data "$outdir/test20.data" -tree "$outdir/break_prediction.tree" \
        -o "$outdir/test20.predicted"

    # Relative link target, so the link stays valid if syntax/ is moved.
    ln -sf "break_prediction_tree.$NTCOUNT.$MODE/break_prediction.tree" \
        "syntax/break_prediction.$NTCOUNT.$MODE.tree"

    echo "Tree located in $outdir/break_prediction.tree"
fi

# Action: fill_bnb_probs
# Counts how often each label (first space-separated column) occurs in
# break20.train.data, writes "label<TAB>relative frequency<TAB>count" lines
# to break20.train.probs, and substitutes the B and NB frequencies for the
# __BPROB__ / __NBPROB__ placeholders in festvox/${FV_VOICENAME}_phrasing.scm.
# The untouched file is kept next to it with an "orig" suffix.
if [[ "$ACTION" == "fill_bnb_probs" ]]; then
    treedir="syntax/break_prediction_tree.$NTCOUNT.$MODE"
    phrasing_scm="festvox/${FV_VOICENAME}_phrasing.scm"

    # A single plain sort is enough: it orders by the whole line, so any
    # ordering applied before it has no effect on the result.
    cut -f 1 -d " " < "$treedir/break20.train.data" | awk '
     # Print list of word frequencies
     BEGIN {tot = 0}
     {
         for (i = 1; i <= NF; i++)
             freq[$i]++;
         tot = tot + NF
     }

     END {
         for (word in freq)
             printf "%s\t%f\t%d\n", word, (freq[word])/tot, freq[word]
     }' | sort > "$treedir/break20.train.probs"

    bprob=$(awk '$1=="B" {print $2}' < "$treedir/break20.train.probs")
    nbprob=$(awk '$1=="NB" {print $2}' < "$treedir/break20.train.probs")

    # Refuse to write an empty value into the Scheme file: that would
    # consume the placeholder and leave nothing in its place.
    if [[ -z "$bprob" || -z "$nbprob" ]]; then
        echo "Could not find both B and NB probabilities in $treedir/break20.train.probs" >&2
        exit 1
    fi

    # One sed run with both substitutions. The backup suffix is attached to
    # -i, a form that GNU and BSD sed both accept, whereas a bare `sed -i`
    # (as used before for the second substitution) is GNU-only.
    sed -iorig -e "s/__BPROB__/$bprob/g" -e "s/__NBPROB__/$nbprob/g" "$phrasing_scm"
fi

# Action: do_all
# Re-invokes this script once per step, in pipeline order, passing the
# prompt file and the original third to fifth arguments through unchanged.
# The script path and every argument are quoted so that values containing
# spaces reach the child invocation as single words.
if [[ "$ACTION" == "do_all" ]]; then
    for step in dump_phrasing_tree build_phrasing_grammar utt_parse \
                build_break_prediction_tree fill_bnb_probs; do
        "$0" "$step" "$PROMPTFILE" "$3" "$4" "$5"
    done
fi
