#!/bin/sh
#####################################################-*-mode:shell-script-*-
##                                                                       ##
##                     Carnegie Mellon University                        ##
##                        Copyright (c) 2005                             ##
##                        All Rights Reserved.                           ##
##                                                                       ##
##  Permission is hereby granted, free of charge, to use and distribute  ##
##  this software and its documentation without restriction, including   ##
##  without limitation the rights to use, copy, modify, merge, publish,  ##
##  distribute, sublicense, and/or sell copies of this work, and to      ##
##  permit persons to whom this work is furnished to do so, subject to   ##
##  the following conditions:                                            ##
##   1. The code must retain the above copyright notice, this list of    ##
##      conditions and the following disclaimer.                         ##
##   2. Any modifications must be clearly marked as such.                ##
##   3. Original authors' names are not deleted.                         ##
##   4. The authors' names are not used to endorse or promote products   ##
##      derived from this software without specific prior written        ##
##      permission.                                                      ##
##                                                                       ##
##  CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK         ##
##  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      ##
##  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   ##
##  SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE      ##
##  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    ##
##  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   ##
##  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          ##
##  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       ##
##  THIS SOFTWARE.                                                       ##
##                                                                       ##
###########################################################################
##                                                                       ##
##  Do EHMM labeling                                                     ##
##                                                                       ##
###########################################################################

# Set LANG to C and export it for the tools invoked below.
LANG=C
export LANG

# Nothing below can run without the festvox tree: explain how to point
# FESTVOXDIR at it (bash and csh syntax) and stop.
if [ -z "$FESTVOXDIR" ]
then
   printf '%s\n' \
      "environment variable FESTVOXDIR is unset" \
      "set it to your local festvox directory e.g." \
      '   bash$ export FESTVOXDIR=/home/awb/projects/festvox/' \
      "or" \
      '   csh% setenv FESTVOXDIR /home/awb/projects/festvox/'
   exit 1
fi

# Location of the EHMM scripts and binaries, exported for child processes.
export EHMMDIR=$FESTVOXDIR/src/ehmm

# Voice definitions, read relative to the current directory, so the script
# has to be started from the directory that contains etc/.
. etc/voice.defs

# Use the default prompt list unless PROMPTFILE already has a value.
PROMPTFILE=${PROMPTFILE:-etc/txt.done.data}

# With no arguments, run the complete EHMM labeling pipeline: check that
# the recordings exist, run every stage in order, then clean up.
if [ "$#" -eq 0 ]
then
   # Every utterance named in the second column of the prompt file must
   # have a recording under wav/.  Stop at the first one that does not,
   # and name it so the user knows what to fix.
   for fname in $(awk '{print $2}' "$PROMPTFILE")
   do
      if [ ! -f "wav/$fname.wav" ]
      then
         echo "Missing wave file: wav/$fname.wav"
         echo "Wave files are missing. Aborting EHMM"
         exit 1
      fi
   done

   # Re-invoke this script once per stage.  "$0" is quoted so the script
   # still works when its path contains spaces, and the run stops as soon
   # as a stage reports failure instead of carrying on with missing data.
   for stage in setup phseq feats bw align standardize_statenames
   do
      "$0" "$stage" || exit 1
   done

   # reclaim some disk space
   rm -rf ehmm/feat ehmm/binfeat
   exit 0
fi

# "help": print the stage names, one per line, in the order the full
# pipeline runs them.
if [ "$1" = "help" ]
then
   for stage in setup phseq feats bw align standardize_statenames
   do
      echo "$stage"
   done
   exit 0
fi


# "setup": create the ehmm/ working tree and write ehmm/etc/mysp_settings,
# a copy of $EHMMDIR/etc/mysp_settings with the sampling frequency, frame
# size and frame shift filled in from the first recording in the prompt file.
if [ "$1" = "setup" ]
then
    echo "EHMM setup"
    # -p keeps a repeated setup quiet when the directories already exist.
    mkdir -p ehmm/feat ehmm/etc ehmm/mod

    # Utterance name: second field of the first prompt line.
    sample_wav_name=$(awk 'NR==1{print $2}' "$PROMPTFILE")
    # Sample rate: third space-separated field of ch_wave's "Sample rate" line.
    SAMPFREQ=$("$ESTDIR/bin/ch_wave" -info "wav/$sample_wav_name.wav" | grep 'Sample rate' | cut -d ' ' -f 3)
    if [ -z "$SAMPFREQ" ]
    then
        # Without a rate the frame arithmetic has nothing to work on and
        # the settings file would be written with empty values.
        echo "do_ehmm: cannot determine the sample rate of wav/$sample_wav_name.wav" >&2
        exit 1
    fi

    # Frame size is 0.01 x rate and frame shift 0.005 x rate, truncated
    # to whole numbers.
    FRAMELEN=$(awk -v rate="$SAMPFREQ" 'BEGIN {print int(0.01*rate)}')
    FRAMESHIFT=$(awk -v rate="$SAMPFREQ" 'BEGIN {print int(0.005*rate)}')

    sed -e "s/^SamplingFreq:.*$/SamplingFreq: $SAMPFREQ/g" \
        -e "s/^FrameSize:.*$/FrameSize: $FRAMELEN/g" \
        -e "s/FrameShift:.*$/FrameShift: $FRAMESHIFT/g" \
        "$EHMMDIR/etc/mysp_settings" > ehmm/etc/mysp_settings

    exit 0
fi

# "phseq": run Festival over the prompts to produce the phone sequences in
# ehmm/etc/txt.phseq.data, obtain the phone list ehmm/etc/ph_list, and
# write the utterance list ehmm/etc/mywavelist.
if [ "$1" = "phseq" ]
then
    echo "EHMM extract phone sequences and base hmm state numbers"
    # The Scheme call is passed as ONE quoted argument so that a prompt
    # file path containing whitespace is not split into several arguments.
    "$ESTDIR/../festival/bin/festival" -b "$EHMMDIR/bin/phseq.scm" \
        "(phseq \"$PROMPTFILE\" \"ehmm/etc/txt.phseq.data\")"
    if [ -f etc/ph_list ]
    then
       # Maybe there is an optimized hmm state number list available
       cp -pr etc/ph_list ehmm/etc/ph_list
    else
       perl "$EHMMDIR/bin/phfromutt.pl" ehmm/etc/txt.phseq.data ehmm/etc/ph_list 5
    fi
    # mywavelist: a "NoOfFiles: <line count>" header followed by the first
    # field of every line of txt.phseq.data.
    {
        awk 'END {printf("NoOfFiles: %d\n",NR)}' ehmm/etc/txt.phseq.data
        awk '{print $1}' ehmm/etc/txt.phseq.data
    } >ehmm/etc/mywavelist
    exit 0
fi

# "oldfeats": the earlier feature pipeline.  It is not part of the stage
# sequence run when the script is called without arguments.
if [ "$1" = "oldfeats" ]
then
    echo "EHMM feature extraction and normalization"
    # Still needed if you want to use a new model in islice
    [ -d ehmm/feat ] || mkdir ehmm/feat

    tools=$EHMMDIR/bin
    wavelist=ehmm/etc/mywavelist

    $tools/FeatureExtraction ehmm/etc/mysp_settings $wavelist
    # Trailing (0, 0): delta-cepstrals and delta-delta-cepstrals
    perl $tools/comp_dcep.pl $wavelist ehmm/feat mfcc ft 0 0
    # Trailing (4): scaling factor
    perl $tools/scale_feat.pl $wavelist ehmm/feat ehmm/mod ft 4
    # Trailing (2, 2, 13): no. of gaussians, no. of connections,
    # feature dimension
    perl $tools/seqproc.pl ehmm/etc/txt.phseq.data ehmm/etc/ph_list 2 2 13

    echo "Converting feats to binary format"
    ncpu=$(./bin/find_num_available_cpu)
    mkdir ehmm/binfeat
    # Reduce every *.ft path to its file name, then convert the files in
    # parallel from ehmm/feat into ehmm/binfeat under the same name.
    find ehmm/feat -name '*.ft' -print0 |
        xargs -0 -n1 basename |
        xargs -n1 -P $ncpu -I {} $tools/ConvertFeatsFileToBinaryFormat ehmm/feat/{} ehmm/binfeat/{}
    exit 0
fi

# "feats": extract features for every utterance in ehmm/etc/mywavelist in
# parallel, convert them to binary files in ehmm/binfeat, scale them, and
# run seqproc.pl on the phone sequences.
if [ "$1" = "feats" ]
then
    echo "EHMM Feature extraction and normalization with new code"

    # Number of parallel jobs.  Fall back to a single job when the helper
    # is missing or prints something that is not a positive integer: an
    # empty value would otherwise break the awk, xargs and tool calls below.
    num_cpus=$(./bin/find_num_available_cpu)
    case "$num_cpus" in
        ''|*[!0-9]*|0)
            echo "do_ehmm: cannot determine the number of CPUs, using 1" >&2
            num_cpus=1
            ;;
    esac

    # Split mywavelist into $num_cpus parts.  Line 1 (the NoOfFiles header)
    # is skipped; every other line NR goes to part NR % num_cpus.  Each
    # part is written with its own "NoOfFiles: <count>" header.
    part_ids=
    i=0
    while [ "$i" -lt "$num_cpus" ]
    do
        awk -v part="$i" -v nparts="$num_cpus" '
            NR > 1 && NR % nparts == part { line[++count] = $0 }
            END {
                printf("NoOfFiles: %d\n", count)
                for (k = 1; k <= count; k++) print line[k]
            }' ehmm/etc/mywavelist > "ehmm/etc/mywavelist.part-$i"
        part_ids="$part_ids $i"
        i=$((i + 1))
    done

    # Run Feature Extraction, one process per part.
    # $part_ids is deliberately unquoted: one part number per output line.
    printf '%s\n' $part_ids |
        xargs -P "$num_cpus" -I{} "$EHMMDIR/bin/FeatureExtraction" ehmm/etc/mysp_settings ehmm/etc/mywavelist.part-{}

    # Convert Features to Binary: ehmm/feat/NAME.mfcc -> ehmm/binfeat/NAME.ft
    # (-I already means one input item per command, so no -n1 is given.)
    mkdir -p ehmm/binfeat
    find ehmm/feat -name '*.mfcc' -print0 |
        xargs -0 -I {} basename {} .mfcc |
        xargs -P "$num_cpus" -I {} "$EHMMDIR/bin/ConvertFeatsFileToBinaryFormat" ehmm/feat/{}.mfcc ehmm/binfeat/{}.ft

    # Cleanup the parts
    rm -rf ehmm/etc/mywavelist.part*

    # Normalize and Scale the feats
    # 4 => Scaling Factor
    # Last => Number of threads to spawn
    "$EHMMDIR/bin/ScaleBinaryFeats" ehmm/etc/mywavelist 4 "$num_cpus"

    # Last params: no. of gaussians, no. of connections, feature_dimension
    perl "$EHMMDIR/bin/seqproc.pl" ehmm/etc/txt.phseq.data ehmm/etc/ph_list 2 2 13

    exit 0
fi
# "bw": Baum-Welch re-estimation with the ehmm binary, reading the binary
# features in ehmm/binfeat and using ehmm/mod as the model directory.
if [ "$1" = "bw" ]
then
    echo "EHMM baum-welch re-estimation"
    num_cpus=$(./bin/find_num_available_cpu)
    # Report the same variable that is handed to ehmm below; the message
    # previously expanded a misspelled, unset name and printed a blank.
    echo "Using $num_cpus threads"
    "$EHMMDIR/bin/ehmm" ehmm/etc/ph_list.int ehmm/etc/txt.phseq.data.int 1 0 ehmm/binfeat scaledft ehmm/mod 0 0 0 30 $num_cpus
            #Numbers (1, 0): Sequential Flag, Retrain Flag
            #Numbers (0, 0, 0): Fully-Connected Flag, Perturbation Flag, Skip-Flag
            #Numbers (30, num_cpus): Number of Max Iters, Number of multiple-threads
   exit 0
fi

# "align": run the edec aligner to write labels under lab/, convert the
# label symbols with sym2nm.pl, then build etc/statenames from
# ehmm/etc/ph_list.int_log.
if [ "$1" = "align" ]
then
    echo "EHMM align"
    num_cpus=$(./bin/find_num_available_cpu)

    # Earlier invocation on the ehmm/feat "ft" files, kept for reference:
    #$EHMMDIR/bin/edec ehmm/etc/ph_list.int ehmm/etc/txt.phseq.data.int 1 ehmm/feat ft ehmm/etc/mysp_settings ehmm/mod 0 lab
    #
    # Arguments, in order:
    #   phone list and phone sequences
    #   1          sequential flag
    #   ehmm/binfeat scaledft   use binary feature files
    #   settings file and model directory
    #   0          nde flag, if 1 uses Viterbi_NDE
    #   lab        label directory
    #   $num_cpus  number of threads
    $EHMMDIR/bin/edec \
        ehmm/etc/ph_list.int ehmm/etc/txt.phseq.data.int \
        1 \
        ehmm/binfeat scaledft \
        ehmm/etc/mysp_settings ehmm/mod \
        0 \
        lab \
        $num_cpus

    perl $EHMMDIR/bin/sym2nm.pl lab ehmm/etc/ph_list.int  #Earlier it was .map
    ###perl $EHMMDIR/bin/check_lab.pl lab ehmm/etc/txt.phseq.data  #commented due to use of short pause..

    # etc/silence is produced by Festival when it is not already there;
    # its first field is the voice's silence phone name.
    if [ ! -f etc/silence ]
    then
       $ESTDIR/../festival/bin/festival -b festvox/build_clunits.scm "(find_silence_name)"
    fi
    SILENCE=$(awk '{print $1}' etc/silence)

    # For each line with more than one field, print the phone name ($1)
    # followed by "<phone>_<field>" for fields 4 .. 4+$2-3.  Afterwards
    # every " ssil" is replaced by the silence name.
    cat ehmm/etc/ph_list.int_log |
    awk 'NF > 1 {
             printf("%s ", $1);
             stop = 4 + $2 - 2;
             for (col = 4; col < stop; col++)
                printf("%s_%s ", $1, $col);
             printf("\n");
         }' |
    sed 's/ ssil/ '$SILENCE'/g' >etc/statenames
    exit 0
fi

# "standardize_statenames": renumber the HMM states listed in etc/statenames
# and rewrite the state labels in lab/*.sl to match.
#   - etc/statenames is kept as etc/statenames.ehmm and regenerated.
#   - etc/mapstatenames.awk is generated: an awk program mapping each
#     original state number to its new index and phone name.
#   - every lab/<utt>.sl is kept as lab/<utt>.slehmm and rewritten through
#     that program.
if [ "$1" = "standardize_statenames" ]
then
    echo "EHMM standardize_statenames"
    # Keep the original table; everything below is derived from this copy.
    mv etc/statenames etc/statenames.ehmm
    # Pass 1: new etc/statenames.
    # The two sed commands turn every "_<digits> " into " <digits> ", so a
    # state such as "aa_3" becomes the two fields "aa" and "3".
    # Lines with at least 4 fields are written as
    #   <phone> <$2>_1 <$4>_2 <$6>_3 <$8>_4 ...
    # (name fields 6, 8, ... below NF get index i/2); shorter lines are
    # written as "<phone> <$2>_5".
    cat etc/statenames.ehmm |
    sed 's/_[0-9][0-9]* /__&/g' | sed 's/___/ /g' |
    awk '{if (NF >= 4)
          {
             printf("%s %s_1 %s_2 ",$1,$2,$4);
             for (i=6; i<NF; i+=2)
                printf("%s_%d ",$i,i/2);
             printf("\n");
          }
          else
             printf("%s %s_5\n",$1,$2);
         }' >etc/statenames
    # Pass 2: generate etc/mapstatenames.awk from the same split table.
    # Its BEGIN block sets, for each original state number,
    #   statenamemap[<old number>] = <new index>  (1, 2, i/2, or 5)
    #   statename[<old number>]    = "<name>"
    # Its main rule prints lines with fewer than 3 fields unchanged and
    # prints every other line as
    #   <$1> 125 <statenamemap[$3]> <statename[$3]>
    # ("%%s"/"%%d" and "\\n" below come out as "%s"/"%d" and "\n" in the
    # generated program.)
    cat etc/statenames.ehmm |
    sed 's/_[0-9][0-9]* /__&/g' | sed 's/___/ /g' |
    awk 'BEGIN { printf("BEGIN { \n");}
         {if (NF >= 4)
          {
             printf("statenamemap[%d] = 1;\n",$3);
             printf("statename[%d] = \"%s\";\n",$3,$2);
             printf("statenamemap[%d] = 2;\n",$5);
             printf("statename[%d] = \"%s\";\n",$5,$4);
             for (i=6; i<NF; i+=2)
             {
                printf("statenamemap[%d] = %d;\n",$(i+1),i/2);
                printf("statename[%d] = \"%s\";\n",$(i+1),$i);
             }
          }
          else
          {
             printf("statenamemap[%d] = 5;\n",$3);
             printf("statename[%d] = \"%s\";\n",$3,$2);
          }
         }
         END {printf("}\n");
              printf("{ if (NF < 3)\n");
              printf("     print $0;\n");
              printf("  else\n");
              printf("     printf(\"%%s 125 %%d %%s\\n\",$1,statenamemap[$3],statename[$3])\n");
              printf("}\n");}' >etc/mapstatenames.awk
   # Pass 3: for every utterance named in the second field of the prompt
   # file, keep the original state labels as lab/<utt>.slehmm and write the
   # remapped labels to lab/<utt>.sl.
   cat ${PROMPTFILE} |
   awk '{print $2}' |
   while read x
   do
      mv lab/$x.sl lab/$x.slehmm
      awk -f etc/mapstatenames.awk lab/$x.slehmm >lab/$x.sl
   done
   exit 0
fi

# Reached only when the first argument matched none of the stage names
# handled above.  "$1" is quoted so the argument is echoed exactly as it
# was given: no word splitting, and no filename expansion of "*" or "?".
echo "do_ehmm: unknown option $1"
exit 1


