#!/bin/sh
#####################################################-*-mode:shell-script-*-
##                                                                       ##
##                     Carnegie Mellon University                        ##
##                         Copyright (c) 2009                            ##
##                        All Rights Reserved.                           ##
##                                                                       ##
##  Permission is hereby granted, free of charge, to use and distribute  ##
##  this software and its documentation without restriction, including   ##
##  without limitation the rights to use, copy, modify, merge, publish,  ##
##  distribute, sublicense, and/or sell copies of this work, and to      ##
##  permit persons to whom this work is furnished to do so, subject to   ##
##  the following conditions:                                            ##
##   1. The code must retain the above copyright notice, this list of    ##
##      conditions and the following disclaimer.                         ##
##   2. Any modifications must be clearly marked as such.                ##
##   3. Original authors' names are not deleted.                         ##
##   4. The authors' names are not used to endorse or promote products   ##
##      derived from this software without specific prior written        ##
##      permission.                                                      ##
##                                                                       ##
##  CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK         ##
##  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      ##
##  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   ##
##  SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE      ##
##  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    ##
##  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   ##
##  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          ##
##  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       ##
##  THIS SOFTWARE.                                                       ##
##                                                                       ##
###########################################################################
##                                                                       ##
##  Find simple POS classes from unlabeled data in language              ##
##  It's good to have at least thousands of utterances to start with     ##
##                                                                       ##
##  find_gpos DATA.txt                                                   ##
##                                                                       ##
###########################################################################

LANG=C; export LANG

PREV="00"
awk '{printf(" %s \n",$0)}' $1 |
sed 's/,/_/g' >data_${PREV}.txt

NUMITER=20
echo 1 $NUMITER |
awk '{for (i=$1; i<=$2; i++)
         printf("%02d\n",i);}' |
while read i
do
    cat data_${PREV}.txt |
    awk 'BEGIN {tot=0}
         {pp=0; p=0;
         for (i=1; i<=NF; i++)
         {
            freq[sprintf("%s %s %s",pp,p,$i)]++;
            pp=p; p=$i
         }
         tot+=NF;
         }
         END { for (word in freq)
                 printf "%s %f %d\n", word, (freq[word])/tot, freq[word] }' >data_${i}.ngram

   echo "0" >base_e
   base_e=`cat base_e`
   best_x=_nonex_
   best_y=_noney_
   echo Base_e $base_e iteration $i

   cat data_${PREV}.txt |
   awk '{
           for (i = 1; i <= NF; i++)
               freq[$i]++
        }
       END {
           for (word in freq)
               printf "%s\t%d\n", word, freq[word]
       }' | sort --key=2 -nr |
   grep -v OOV |
   grep -v "[()]" |
   awk '{ if (NR < 51) x[NR] = $1}
        END {for (i=1; i<=15; i++)
              for (j=i+1; j<30; j++)
              {
                 print x[i];
                 print x[j];
              }}' |
   while read x
   do
      read y

      cat data_${i}.ngram |
      awk 'BEGIN {x="'$x'"; y="'$y'";}
           { if (($2 == x) || ($2 == y))
             {
                freq_x[sprintf("%s-%s",$1,$3)]+=$5;
                freq_y[sprintf("%s-%s",$1,$3)]+=$5;
                freq[sprintf("%s-%s",$1,$3)]+=1;
             }
           }
           END { t=0;
                 for (b in freq)
                 {
                    if (freq[b] > 1)
                       t+=freq[b];
                 }
                 xwcount=1;
                 for (i=1; i<length(x); i++)
                    if (substr(x,i,1) == ",")
                        xwcount++;
                 ywcount=1;
                 for (i=1; i<length(y); i++)
                    if (substr(y,i,1) == ",")
                        ywcount++;
                 print t/(xwcount*ywcount)
               }' >this_e
      this_e=`cat this_e`
#      echo Testing $x $y $this_e $c_base
      better=`echo $this_e $base_e | awk '{if ($1 > $2) print "better"; else print "notbetter"}'`
      if [ $better = "better" ]
      then
         cat data_${PREV}.txt |
         awk 'BEGIN {x="'$x'"; y="'$y'";}
              { for (i=1; i<=NF; i++)
                {
                   if (($i == x) || ($i == y))
                      printf("%s,%s ",x,y);
                   else
                      printf("%s ",$i);
                }
                printf("\n");
              }' >data_${i}_best.txt
         echo Better $x $y $this_e $base_e $i
         echo $x >base_x
         echo $y >base_y
         echo $this_e >base_e
         base_e=$this_e
      fi
   done
   base_e=`cat base_e`
   best_x=`cat base_x`
   best_y=`cat base_y`
   echo Best $base_e $best_x $best_y
   mv data_${i}_best.txt data_${i}.txt
   cat data_${i}.txt |
   awk '{for (i=1; i<=NF; i++)
            if ($i ~ /.*,.*/)
               freq[$i]++;
        }
        END {for (x in freq)
             {
                tag=x;
                for (i=1; i<=length(tag); i++)
                {
                   if (substr(tag,i,1) == ",")
                      printf(" ");
                   else
                      printf("%s",substr(tag,i,1));
                }
                printf("\n");
             }}' >tagset_${i}.txt
   rm data_${i}.ngram
   rm data_${PREV}.txt
   PREV=$i
   # Need better stopping criteria
   if [ $base_e = "0" ]
   then
      exit
   fi
done

rm data_${PREV}.txt
rm -f base_e base_x base_y this_e

