#!/bin/sh
###########################################################################
##                                                                       ##
##                   Carnegie Mellon University and                      ##
##                   Alan W Black and Kevin A. Lenzo                     ##
##                      Copyright (c) 1998-2000                          ##
##                        All Rights Reserved.                           ##
##                                                                       ##
##  Permission is hereby granted, free of charge, to use and distribute  ##
##  this software and its documentation without restriction, including   ##
##  without limitation the rights to use, copy, modify, merge, publish,  ##
##  distribute, sublicense, and/or sell copies of this work, and to      ##
##  permit persons to whom this work is furnished to do so, subject to   ##
##  the following conditions:                                            ##
##   1. The code must retain the above copyright notice, this list of    ##
##      conditions and the following disclaimer.                         ##
##   2. Any modifications must be clearly marked as such.                ##
##   3. Original authors' names are not deleted.                         ##
##   4. The authors' names are not used to endorse or promote products   ##
##      derived from this software without specific prior written        ##
##      permission.                                                      ##
##                                                                       ##
##  CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK         ##
##  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      ##
##  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   ##
##  SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE      ##
##  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    ##
##  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   ##
##  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          ##
##  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       ##
##  THIS SOFTWARE.                                                       ##
##                                                                       ##
###########################################################################
##                                                                       ##
##  This does bigram maximization on an ldom dataset                     ##
##                                                                       ##
##  Run this multiple times until it ceases to reduce; each cycle will   ##
##  push bigram-rich sentences to the start of the list                  ##
##                                                                       ##
###########################################################################

# Usage: data_select MUMBLE.DATA
#
# Reads the lines of MUMBLE.DATA that start with "(" and are laid out as
#     ( id "text" )
# turns each one into "id w1_w2 w2_w3 ..." (the quotes become QUOTE tokens
# so sentence-initial and sentence-final words form bigrams too), and then
# repeatedly:
#   1. scores every sentence by how many of its bigrams have not been seen
#      in any earlier sentence of the current list,
#   2. sorts by that score, highest first, and drops sentences scoring 0,
# until a pass removes nothing.  The surviving prompts are written, in the
# final order, to MUMBLE.DATA.selected.
if [ $# != 1 ]
then
    echo "Reduce a prompt set by doing bigram maximisation "
    echo "Usage: data_select MUMBLE.DATA"
    exit 1
fi

datafile=$1
if [ ! -r "$datafile" ]
then
    echo "data_select: cannot read $datafile" >&2
    exit 1
fi

# LC_ALL takes precedence over LANG, so pin both; otherwise an inherited
# LC_ALL would change how sort orders the tie-breaks.
LANG=C; export LANG
LC_ALL=C; export LC_ALL

# Private scratch directory: no predictable names in /tmp, no clashes
# between concurrent runs, and it is removed on every exit path.
tmpdir=$(mktemp -d "${TMPDIR:-/tmp}/data_select.XXXXXX") || exit 1
trap 'rm -rf "$tmpdir"' 0
trap 'exit 1' 1 2 15

pass=0

# Pass 0: one line per prompt, "id bigram bigram ...".
# Files are fed on stdin so odd file names are never parsed as options
# (grep) or as var=value operands (awk).
grep "^(" < "$datafile" |
sed -e 's/"/QUOTE /' -e 's/"/ QUOTE/' |
awk '{printf("%s ",$2);
      # fields 3..NF-1 are QUOTE, the words, QUOTE; field NF is ")"
      for (i=3; i<NF-1; i++)
         printf("%s_%s ",$i, $(i+1));
      printf("\n")}' > "$tmpdir/bigrams.$pass"

while true
do
   npass=$((pass + 1))
   cur=$tmpdir/bigrams.$pass
   next=$tmpdir/bigrams.$npass

   # Prefix each sentence with its count of not-yet-seen bigrams, then
   # record its bigrams as seen; sort best-first, drop the zero scorers
   # and strip the score column again.
   awk 'function score_bis(    i, score)
     {
         score = 0;
         for (i=2; i<=NF; i++)
         {
            if (bis[$i] == 0)
               score++;
         }
         return score;
     }
     { print score_bis(),$0
       for (i=2; i<=NF; i++)
       {
          bis[$i] += 1;
       }
     }' < "$cur" | sort -nr | grep -v "^0 " |
     awk '{for (i=2; i<=NF; i++)
              printf("%s ",$i);
           printf("\n")}' > "$next"

   # awk prints a bare integer, unlike some wc implementations that pad
   # the count with blanks.
   passnum=$(awk 'END {print NR}' < "$cur")
   npassnum=$(awk 'END {print NR}' < "$next")
   echo "Pass $pass reduced from $passnum to $npassnum"

   if [ "$passnum" -eq "$npassnum" ]
   then
       # Nothing was dropped: emit the original prompt lines for the
       # surviving ids, in the final ranking order.  Ids are compared
       # literally against the id field (field 2) of "(" lines, so an id
       # is never interpreted as a regular expression and cannot match
       # text inside another prompt.
       awk -v idfile="$next" '
           BEGIN {
               while ((getline entry < idfile) > 0)
                   if (split(entry, f, " ") > 0)
                       order[++n] = f[1]
               close(idfile)
           }
           /^\(/ {
               lines[$2] = ($2 in lines) ? lines[$2] "\n" $0 : $0
           }
           END {
               for (k = 1; k <= n; k++)
                   if (order[k] in lines)
                       print lines[order[k]]
           }' < "$datafile" > "$datafile.selected"
       exit 0
   fi
   pass=$npass
done

   


