/* $Id: bwt.cpp 89 2004-12-14 20:22:17Z towi $ */

#include <stdio.h>
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <sys/stat.h>
#include <locale.h>

#ifdef __GNUG__
#include <unistd.h>
#endif

#include "bwt.h"
#include "output.h"
#include "search.h"
#include "compressor.h"
#include "ds.h"
#include "timer.h"


/****************************************************************
* Outside defines used here:
*   STATISTICS -- print rle compression rate of bwt column L
*   FILEWRITE_INDEX -- write bwt vector EL from genidx()
*   NO_PROGRESS_DOTS -- no dot printing for better timings
*   _WAIT_FOR_KEYPRESS -- wait right before program ends 
* Useful only during debugging phase:
*   DEBUG -- compare results and print stuff
*   FULLDEBUG -- print even more diagnostic output
****************************************************************/


/* Report the result of the preceding search run.
 *
 * Reads a variable named `found` that must be in scope at the point of use:
 * a negative value prints the "not found" message, anything else prints the
 * match position. The body is wrapped in do { } while (0) so that `FOUND;`
 * behaves as exactly one statement, also inside an unbraced if/else.
 */
#define FOUND do { \
   if (found < 0) \
      printf(" nicht gefunden\n"); \
   else \
      printf(" gefunden an Position %d\n", found); \
} while (0)

/* Program name and version string; both are printed as a banner by main(). */
static const char *projectname = "BWT-Search";
static const char *projectversion = "$Id: bwt.cpp 89 2004-12-14 20:22:17Z towi $";
/* NOTE(review): M is not referenced anywhere in the visible part of this
 * file -- presumably used by another translation unit; confirm before removing. */
char **M;
/* Number of bytes read from the input file by main(); used throughout this
 * file as the length of the BWT column L and of the vector EL. */
int Mlen;
/* Cumulative character counts filled in by BWT_genidx(): C[c] is the number
 * of characters in L whose code is smaller than c. */
int C[256];
/* Transformation vector; allocated in main() with Mlen entries and filled
 * by BWT_genidx(). */
int *EL;

/* Count how often the character s[n] occurs in s before index n.
*
* The result is the zero-based occurrence rank of s[n]: 0 means that
* s[n] is the first occurrence of that character in s.
*/
int getNumOfOccurrence(char *s, int n)
{
   const char target = s[n];
   int rank = 0;
   int idx = 0;

   while (idx < n)
   {
      // a comparison yields 0 or 1, so it can be added directly
      rank += (s[idx] == target);
      idx++;
   }
   return rank;
}


/* Find the index in s of the n-th (zero-based) occurrence of ch.
*
* At most Mlen characters of s are examined. If s holds fewer than n + 1
* occurrences of ch, the scan runs to the end and Mlen - 1 is returned.
*/
int posOfNthOccurrence(char *s, int n, char ch)
{
   int pos;
   int hits = 0;

   for (pos = 0; pos < Mlen; ++pos)
   {
      // stop one step after the wanted occurrence has been counted
      if (hits > n)
         break;
      if (s[pos] == ch)
         ++hits;
   }
   // pos is one past the character that ended the scan
   return pos - 1;
}


/* qsort() comparison callback for single characters.
*
* a and b point to the two chars to compare. Returns -1, 0 or 1 when *a is
* smaller than, equal to or greater than *b. The comparison is done on plain
* char, so the ordering of bytes >= 0x80 depends on the signedness of char.
*/
int comparechr(const void *a, const void *b)
{
   const char lhs = *(const char *) a;
   const char rhs = *(const char *) b;

   // (greater) - (less) collapses the three cases into one expression
   return (lhs > rhs) - (lhs < rhs);
}


/* Reconstruct the original string T from the BWT pair (L, I).
*
* Reads Mlen characters from bwt->L and uses bwt->I as the start index into
* the sorted column. Returns a newly allocated, NUL-terminated buffer of
* Mlen + 1 chars; the caller owns it and must release it with delete [].
*/
char *BWT_decode(BWT *bwt)
{
   // Copy L into E, then sort E character by character.
   // The array needs Mlen + 1 elements: Mlen characters plus the terminator.
   char *E = new char[Mlen + 1];
   memcpy(E, bwt->L, Mlen);
   E[Mlen] = '\0';
   qsort(E, Mlen, sizeof(char), comparechr);

#ifdef FULLDEBUG
   printf("\nF = '%s'", E);
#endif

   // Reconstruct the original string T from the sorted column and I
   char *T = new char[Mlen + 1];
   T[Mlen] = '\0';

   // getNumOfOccurrence() and posOfNthOccurrence() are used here only to
   // demonstrate how the inverse transformation works. It would be far
   // more efficient to use the algorithm that BWT_genidx() employs to
   // build the transformation vector.
   int j = bwt->I;
   for (int i = 0; i < Mlen; i++)
   {
      T[i] = E[j];
      // which occurrence of E[j] is this in the sorted column ...
      int occ = getNumOfOccurrence(E, j);
      // ... and where does that same occurrence sit in L?
      j = posOfNthOccurrence(bwt->L, occ, E[j]);
   }
   delete [] E;
   return T;
}


/* Build the array C and the transformation vector EL.
*
* L   the BWT column; Mlen characters are read from it.
* EL  output vector with room for Mlen ints (this parameter shadows the
*     global of the same name).
*
* Side effect: overwrites the global C[] with the cumulative character
* counts of L. The work arrays are static, so the function is not reentrant.
*/
void BWT_genidx(const char *L, int *EL)
{
   static int histo[256];
   static int pos[256];
   int i;

   // reset the frequency of every character to zero
   memset(histo, 0, sizeof histo);

   // count the frequency of every character contained in L.
   // Index through unsigned char: a plain char may be negative for
   // bytes >= 0x80 and would address memory in front of the array.
   for (i = 0; i < Mlen; i++)
      histo[(unsigned char) L[i]]++;

   // compute C[] from the histogram in histo[].
   // The character with code 0 necessarily comes first.
   C[0] = 0;
   for (i = 1; i < 256; i++)
      C[i] = C[i-1] + histo[i-1];

   // save the whole array C in pos, because the
   // following step modifies pos
   memcpy(pos, C, sizeof pos);

   // compute the transformation vector EL: each position i of L is appended
   // to the bucket of its character, so every bucket holds ascending indices
   for (i = 0; i < Mlen; i++)
      EL[pos[(unsigned char) L[i]]++] = i;
}


/* Search for the pattern P using the globals C[] and EL[].
*
* P     pattern characters; Plen of them are read (no terminator needed).
* Plen  length of the pattern.
*
* Returns the number of occurrences found. A result <= 0 means "not found":
* -1 when a pattern character has an empty bucket in C[], otherwise the
* (non-positive) size of the collapsed range.
*/
int BWT_search(const char *P, int Plen)
{
   // Start with an empty search term, i.e. a range in EL
   // that spans the whole length of the original text.
   int matched_start = 0;
   int matched_end   = Mlen - 1;

   // extend the search term to the left one character at a time,
   // narrowing the range in EL with every step
   for (int i = Plen - 1; (i >= 0) && (matched_start <= matched_end); i--)
   {
      // go through unsigned char so bytes >= 0x80 give a valid index
      const int c = (unsigned char) P[i];

      // bucket of character c inside EL. C[] has only 256 entries, so the
      // bucket of the last character code ends at the end of EL.
      const int first = C[c];
      const int last = ((c < 255) ? C[c + 1] : Mlen) - 1;

      // the character does not occur in the text at all
      if (first > last)
         return -1; // not found

      // binary search for matched_start: the first bucket entry whose
      // value is >= matched_start (entries in a bucket are ascending)
      int low = first;
      int high = last;
      while (low <= high)
      {
         const int n = low + (high - low) / 2;
         if (EL[n] < matched_start)
            low = n + 1;
         else
            high = n - 1;
      }
      const int new_start = low;

      // binary search for matched_end: the last bucket entry whose
      // value is <= matched_end
      low = first;
      high = last;
      while (low <= high)
      {
         const int n = low + (high - low) / 2;
         if (EL[n] > matched_end)
            high = n - 1;
         else
            low = n + 1;
      }

      matched_start = new_start;
      matched_end = high;
   }
   // return the number of matches found
   return matched_end - matched_start + 1;
}


/* Print the command line synopsis and an example call to stdout.
*/
void usage()
{
   // the text contains no conversion specifications, so it can be
   // written out verbatim
   fputs("\nAufruf: bwt datei muster [ iterationen ]"
      "\nBeispiel: bwt beispiel.txt muster 100000\n", stdout);
}


/* Program entry point.
*
* Usage: bwt file pattern [ iterations ]
*
* Reads the file into memory, builds the BWT and its index, then runs each
* search algorithm for the pattern and prints the measured times.
* Returns 0 on success and 1 on usage, file or allocation errors.
*
* NOTE(review): START, STOP, ITERATE and PROGRESS are macros defined outside
* this file; they presumably reference the locals `iterator`, `iterations`
* and `stopwatch`, so those names are kept even where they look unused.
*/
int main(int argc, char *argv[])
{
   int found = -1;
   int iterator;
   struct stat fattr;
   TIMER stopwatch;
   long cpu_time_used_bf = -1;
   long cpu_time_used_bf_safe = -1;
   long cpu_time_used_bm = -1;
   long cpu_time_used_bm_idx = -1;
   long cpu_time_used_qs = -1;
   long cpu_time_used_qs_idx = -1;
   long cpu_time_used_ts = -1;
   long cpu_time_used_ts_idx = -1;
   long cpu_time_used_bwt = -1;
   long cpu_time_used_bwt_enc = -1;
   long cpu_time_used_bwt_idx = -1;
#  ifdef DEBUG
   char *E;
#  endif
#  ifdef STATISTICS
   char *rle;
#  endif

   printf("%s %s\n\n", projectname, projectversion);
   if (argc < 3)
   {
      usage();
      return 1;
   }
   if (stat(argv[1], &fattr) != 0)
   {
      perror("can't stat() file");
      return 1;
   }
   BWT *bwt = (BWT *) malloc(sizeof(BWT));  // (L, I)
   if (bwt == NULL)
   {
      perror("can't allocate BWT");
      return 1;
   }
   char *P = argv[2];
   int Plen = (int) strlen(P);
   int iterations = (argc > 3) ? atoi(argv[3]) : 100000;
   printf("Lesen der Datei '%s' .. ", argv[1]);
   fflush(stdout);
   ds_overshoot = init_ds_ssort(500, 2000);
   if (ds_overshoot <= 0)
   {
      fprintf(stderr, "fail overshoot. 'impossible' ds_ssort lib error.\n");
      exit(-1);
   }
   // the buffer is over-allocated by ds_overshoot bytes and zero-filled
   char *T = (char *) malloc(fattr.st_size + ds_overshoot);
   if (T == NULL)
   {
      perror("can't allocate text buffer");
      free(bwt);
      return 1;
   }
   memset(T, '\0', fattr.st_size + ds_overshoot);

   // binary mode: the buffer is sized from st_size, so the bytes must be
   // read untranslated
   FILE *fp = fopen(argv[1], "rb");
   if (fp == NULL)
   {
      perror("can't open file");
      free(T);
      free(bwt);
      return 1;
   }
   Mlen = (int) fread(T, sizeof(char), fattr.st_size, fp);
   fclose(fp);
   printf("%d Byte\n", Mlen);

#ifdef DEBUG
   // print the original text
   printf("\nT = '%s'\n", T);
#endif

   START("BWT_encode(deep_shallow)");
   BWT_encode_DeepShallow(T, bwt);
   STOP(cpu_time_used_bwt_enc);
   printf("\n");

   EL = (int *) malloc(Mlen * sizeof(int));
   // malloc(0) may legitimately return NULL for an empty file
   if (EL == NULL && Mlen > 0)
   {
      perror("can't allocate index");
      free(T);
      free(bwt);
      return 1;
   }
   START("BWT_genidx()");
   ITERATE {
      PROGRESS;
      BWT_genidx(bwt->L, EL);
   }
   STOP(cpu_time_used_bwt_idx);

#ifdef FILEWRITE_INDEX
   // write the binary data of EL to the file "EL.dat"
   fp = fopen("EL.dat", "w+");
   if (fp == NULL)
      perror("can't open EL.dat");
   else
   {
      fwrite(EL, Mlen, sizeof(int), fp);
      fclose(fp);
   }
#endif

#ifdef STATISTICS
   // determine and print the compression rate for L
#ifdef DEBUG
   printf("\nL (vorher)  = '%s'", bwt->L);
#endif
   rle = RLE_compress(bwt->L);
#ifdef DEBUG
   printf("\nL (nachher) = '%s'", RLE_decompress(rle));
#endif
   // strlen() yields size_t; cast to match the %d conversions
   printf("\nKompressionsrate von rle(L)/T: %d/%d = %.2f%%\n", 
      (int) strlen(rle),
      (int) strlen(T),
      100 * (float) strlen(rle) / (float) strlen(T));
#endif

#ifdef DEBUG
   // print (L, I)
   printf("\n\nI = %d"
      "\n\nL = '%s'",
      bwt->I,
      bwt->L);
   // compute E by sorting L; one extra byte for the terminator
   E = (char *) malloc(Mlen + 1);
   if (E != NULL)
   {
      memcpy(E, bwt->L, Mlen);
      E[Mlen] = '\0';
      qsort(E, Mlen, sizeof(char), comparechr);
      // print the transformation vector EL
      printf("\n i E EL\n");
      for (int i = 0; i < Mlen; i++)
         printf("%2d %c %2d\n", i, E[i], EL[i]);
      free(E);
   }
   // print the reconstructed original text
   printf("\n\nT = '%s'\n", BWT_decode(bwt));
#endif

   printf("\n%d Suchlufe nach '%s':\n", iterations, P);

   START("Brute-Force-Suche");
   ITERATE {
      PROGRESS;
      found = bf_search(T, P, Plen);
   }
   STOP(cpu_time_used_bf); FOUND;

   START("Brute-Force-Safe!");
   ITERATE {
      PROGRESS;
      found = bf_search_safe(T, P, Mlen, Plen);
   }
   STOP(cpu_time_used_bf_safe); FOUND;

   START("Boyer-Moore-Index");
   ITERATE {
      PROGRESS;
      bm_init(P, Plen);
   }
   STOP(cpu_time_used_bm_idx);
   printf("\n");
   START("Boyer-Moore-Suche");
   ITERATE {
      PROGRESS;
      found = bm_search(T, P, Plen);
   }
   STOP(cpu_time_used_bm);
   FOUND;

   START("Quicksearch-Index");
   ITERATE {
      PROGRESS;
      qs_init(P, Plen);
   }
   STOP(cpu_time_used_qs_idx);
   printf("\n");
   START("Quicksearch-Suche");
   ITERATE {
      PROGRESS;
      found = qs_search(T, P, Plen);
   }
   STOP(cpu_time_used_qs);
   FOUND;

   START("Turbosearch-Index");
   ITERATE {
      PROGRESS;
      ts_init(P, Plen);
   }
   STOP(cpu_time_used_ts_idx);
   printf("\n");
   START("Turbosearch-Suche");
   ITERATE {
      PROGRESS;
      found = ts_search(T, P, Plen);
   }
   STOP(cpu_time_used_ts);
   FOUND;

   START("BWT-Index        ");
   ITERATE {
      PROGRESS;
      found = BWT_search(P, Plen);
   }
   STOP(cpu_time_used_bwt);
   // BWT_search() returns a match count, not a position
   if (found <= 0)
      printf(" nicht gefunden\n");
   else
      printf(" %d-mal gefunden\n", found);

   printf("\nLaufzeiten fr jeweils %d Iterationen in ms",
      iterations);
   printf("\n               search +  genidx");
   printf("\nBrute-Force:  %7ld", cpu_time_used_bf);
   printf("\nBrute-Safe!:  %7ld", cpu_time_used_bf_safe);
   printf("\nBoyer-Moore:  %7ld + %7ld", cpu_time_used_bm, cpu_time_used_bm_idx);
   printf("\nQuicksearch:  %7ld + %7ld", cpu_time_used_qs, cpu_time_used_qs_idx);
   printf("\nTurbosearch:  %7ld + %7ld", cpu_time_used_ts, cpu_time_used_ts_idx);
   printf("\nBWT-Index  :  %7ld + %7ld + %7ld", cpu_time_used_bwt, cpu_time_used_bwt_enc, cpu_time_used_bwt_idx);

   // all three buffers came from malloc(), so they are released with free()
   free(EL);
   EL = NULL;
   free(T);
   free(bwt);

#ifdef _WAIT_FOR_KEYPRESS
   printf("\n\nZum Beenden Enter drcken .. ");
   fflush(stdout);
   getchar();
#endif
   printf("\n");
   return 0;
}
