/* common.statistics.n-grams.c - statistical tools for analysing the text
 * 
 * This program is part of Crank, a cryptanalysis tool
 * Copyright (C) 2000 Matthew Russell
 *
 * This program is free software; you can redistribute it and/or modify it 
 * under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License (LICENSE) for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
 * USA
 */

#include "crank.h"
#include "common.statistics.n-grams.h"

/* Global variables */
/* Weights used by total_error() when combining the per-table errors.
 * Entry order: {slft, bift, trift}. */
const float error_weight[] = {
    1.0f,   /* slft  */
    1.0f,   /* bift  */
    2.0f    /* trift */
};

/* Computes a pseudo-random integer in the inclusive range [a, b] using rand().
 *
 * The bounds may be passed in either order.  Previously a call with
 * b == a - 1 evaluated rand() % 0 (undefined behaviour, usually a crash) and
 * any other b < a produced values outside the requested range; the bounds are
 * now normalised first.  Exactly one rand() call is made per invocation.
 */
int randnum (int a, int b) {
    if (b < a) {
        int swap = a;
        a = b;
        b = swap;
    }
    return (a + (rand() % (b - a + 1)));
}

/* Combines the three per-table errors into a single score: each error is
 * multiplied by its entry in error_weight ({slft, bift, trift}) and the
 * products are summed in that order. */
float total_error(float slft_error, float bift_error, float trift_error) {
    float weighted = error_weight[0] * slft_error;
    weighted = weighted + error_weight[1] * bift_error;
    weighted = weighted + error_weight[2] * trift_error;
    return weighted;
}

/* Sum of squared differences between two single-letter frequency tables.
 * Both tables are read at indices 'A'..'Z' only. */
float slft_error(float *slft_std, float *slft_sample) {
    float sum_sq = 0.0f;
    int letter = 'A';
    while (letter <= 'Z') {
        float delta = slft_std[letter] - slft_sample[letter];
        sum_sq += delta * delta;
        letter++;
    }
    return sum_sq;
}

/* Sum of squared differences between two bigram frequency tables.
 * The entry for the pair (i, j) lives at index 26 * i + j, where i and j are
 * the character codes 'A'..'Z'; only those entries are read. */
float bift_error(float *bift_std, float *bift_sample) {
    float sum_sq = 0.0f;
    int first, second;
    for (first = 'A'; first <= 'Z'; first++) {
        float *std_row = bift_std + 26 * first;
        float *sample_row = bift_sample + 26 * first;
        for (second = 'A'; second <= 'Z'; second++) {
            float delta = std_row[second] - sample_row[second];
            sum_sq += delta * delta;
        }
    }
    return sum_sq;
}

/* Sum of squared differences between two trigram frequency tables.
 * The entry for the triple (i, j, k) lives at index
 * 26 * 26 * i + 26 * j + k, where i, j and k are the character codes
 * 'A'..'Z'; only those entries are read. */
float trift_error(float *trift_std, float *trift_sample) {
    float sum_sq = 0.0f;
    int first, second, third;
    for (first = 'A'; first <= 'Z'; first++) {
        for (second = 'A'; second <= 'Z'; second++) {
            const int base = 26 * 26 * first + 26 * second;
            for (third = 'A'; third <= 'Z'; third++) {
                float delta = trift_std[base + third] - trift_sample[base + third];
                sum_sq += delta * delta;
            }
        }
    }
    return sum_sq;
}

/* Make frequency tables
 *
 * Counts the single letters, letter pairs and letter triples found in `text`
 * and stores their relative frequencies in `slft`, `bift` and `trift`.
 *
 * Only the letters A-Z are counted (lower case is folded to upper case).
 * Every other character is skipped without breaking a pair or triple, so
 * "a-b" still contributes the pair AB.
 *
 * Output layout (i, j, k are the character codes 'A'..'Z'):
 *   slft[i]
 *   bift[26 * i + j]
 *   trift[26 * 26 * i + 26 * j + k]
 * Only those entries are written.
 *
 * When a table has no observations at all (no letters, fewer than two letters
 * for bift, fewer than three for trift) its entries are set to 0.0 instead of
 * dividing by zero and filling the table with NaN.
 *
 * Returns the number of letters counted.
 */
int make_ft(char *text, float *slft, float *bift, float *trift) {
    /* Counts are kept in compact 0-based tables (roughly 70 KB of stack) and
     * are mapped onto the 'A'-based output layout only when normalising. */
    int islft[26] = {0};
    int ibift[26][26] = {{0}};
    int itrift[26][26][26] = {{{0}}};
    int slft_total = 0, bift_total = 0, trift_total = 0;
    int prev = -1, prev2 = -1;  /* 0-based previous two letters, -1 = none yet */
    int i, j, k;
    const char *p;

    for (p = text; *p != '\0'; p++) {
        /* <ctype.h> functions are only defined for unsigned char values */
        unsigned char raw = (unsigned char) *p;
        int cur;

        if (!isalpha(raw))
            continue;
        cur = toupper(raw) - 'A';
        /* A locale may classify characters outside A-Z as letters; they have
         * no slot in the tables, so skip them like any other non-letter. */
        if (cur < 0 || cur > 25)
            continue;

        islft[cur] += 1;
        slft_total++;
        if (prev >= 0) {
            ibift[prev][cur] += 1;
            bift_total++;
        }
        if (prev >= 0 && prev2 >= 0) {
            itrift[prev2][prev][cur] += 1;
            trift_total++;
        }
        prev2 = prev;
        prev = cur;
    }

    for (i = 0; i < 26; i++) {
        const int ci = 'A' + i;
        for (j = 0; j < 26; j++) {
            const int cj = 'A' + j;
            for (k = 0; k < 26; k++) {
                (trift + 26 * 26 * ci + 26 * cj)['A' + k] =
                    trift_total ? (float) itrift[i][j][k] / (float) trift_total : 0.0f;
            }
            (bift + 26 * ci)[cj] =
                bift_total ? (float) ibift[i][j] / (float) bift_total : 0.0f;
        }
        slft[ci] = slft_total ? (float) islft[i] / (float) slft_total : 0.0f;
    }
    return slft_total; /* i.e. letter_count */
}

/* Copies the three frequency tables held by `s` into the caller-supplied
 * `slft`, `bift` and `trift` buffers.  Only the entries addressed by the
 * character codes 'A'..'Z' are copied (bift at 26 * i + j, trift at
 * 26 * 26 * i + 26 * j + k). */
void dup_ft(stats *s, float *slft, float *bift, float *trift) {
    int first, second, third;
    for (first = 'A'; first <= 'Z'; first++) {
        const int bi_row = 26 * first;
        const int tri_plane = 26 * 26 * first;
        for (second = 'A'; second <= 'Z'; second++) {
            const int tri_row = tri_plane + 26 * second;
            for (third = 'A'; third <= 'Z'; third++) {
                trift[tri_row + third] = s->trift[tri_row + third];
            }
            bift[bi_row + second] = s->bift[bi_row + second];
        }
        slft[first] = s->slft[first];
    }
}

/* Fills ft_space with a uniform frequency table.
 *
 * ft_type selects the layout: FT_TRIFT writes 1/26^3 to every trigram entry,
 * FT_BIFT writes 1/26^2 to every bigram entry, and any other value writes
 * 1/26 to every single-letter entry.  Entries are addressed by the character
 * codes 'A'..'Z', as in the other table routines of this file. */
void fallback_ft(float *ft_space, int ft_type) {
    int first, second, third;

    if (ft_type == FT_TRIFT) {
        const float uniform = 1.0 / (26.0 * 26.0 * 26.0);
        for (first = 'A'; first <= 'Z'; first++) {
            for (second = 'A'; second <= 'Z'; second++) {
                for (third = 'A'; third <= 'Z'; third++) {
                    ft_space[26 * 26 * first + 26 * second + third] = uniform;
                }
            }
        }
    } else if (ft_type == FT_BIFT) {
        const float uniform = 1.0 / (26.0 * 26.0);
        for (first = 'A'; first <= 'Z'; first++) {
            for (second = 'A'; second <= 'Z'; second++) {
                ft_space[26 * first + second] = uniform;
            }
        }
    } else {
        const float uniform = 1.0 / 26.0;
        for (first = 'A'; first <= 'Z'; first++) {
            ft_space[first] = uniform;
        }
    }
}

/* Calculate index of coincidence
 *
 * slft holds relative letter frequencies at indices 'A'..'Z' and
 * letter_count is the number of letters they were computed from.  With
 * n_i = letter_count * slft[i] the result is
 *     sum over i of  n_i * (n_i - 1) / (N * (N - 1)),   N = letter_count.
 *
 * The formula divides by N - 1, so it is undefined for fewer than two
 * letters; 0.0 is returned in that case instead of NaN.
 */
float calc_ic(float *slft, int letter_count) {
    float total = 0.0, freq;
    int i;

    if (letter_count < 2)
        return 0.0;

    for (i = (int) 'A'; i <= (int) 'Z'; i++) {
        freq = slft[i];
        total += freq * ( ((float) letter_count) * freq - 1) / (float) (letter_count - 1);
    }
    return total;
}

/* Calculate entropy */
float calc_entropy(float *slft) {
    float total = 0.0, freq;
    int i;
    for (i = (int) 'A'; i <= (int) 'Z'; i++) {
	freq = slft[i];
	if (freq > 0.0)
	    total -= freq * log(freq); 
    }
    return total;
}

/* Calculate efficiency
 *
 * Scales an entropy value by ln(26), the entropy of a uniform distribution
 * over 26 letters. */
float calc_efficiency(float entropy) {
    const double max_entropy = log(26.0);
    return entropy / max_entropy;
}

/* Calculate redundancy
 *
 * Returns the complement of the efficiency, i.e. 1 - efficiency. */
float calc_redundancy(float efficiency) {
    const double complement = 1.0 - efficiency;
    return complement;
}

/* Builds a new stats object whose frequency tables are those of `s` with the
 * letters relabelled through `key`.
 *
 * For every source index i in 'A'..'Z' the destination index is
 * (*key)[i] + ('A' - 'a'); presumably (*key)[i] is the lower-case letter that
 * i maps to -- TODO confirm against the key type.  The slft/bift/trift errors
 * and the total error are recomputed against the supplied standard tables;
 * letter_count, ic, entropy, efficiency and redundancy are copied from `s`.
 *
 * Assumes key and stats are complete: destination entries that no source
 * letter maps to are left uninitialised.
 *
 * NOTE(review): the malloc() results are not checked.
 * The returned object and its tables are heap-allocated; free_stats()
 * releases them. */
stats *transform_stats_with_key(stats *s, key *key, float *slft_std, float *bift_std, float *trift_std) {
    const int case_shift = 'A' - 'a';
    int src1, src2, src3;
    int dst1, dst2, dst3;
    float *new_slft = malloc(((int) 'Z' + 1) * sizeof(float));
    float *new_bift = malloc(((int) 'Z' + 1) * ((int) 'Z' + 1)  * sizeof(float));
    float *new_trift = malloc(((int) 'Z' + 1) * ((int) 'Z' + 1) * ((int) 'Z' + 1) * sizeof(float));
    stats *result = malloc( sizeof(stats));

    for (src1 = 'A'; src1 <= 'Z'; src1++) {
        dst1 = (*key)[src1] + case_shift;
        for (src2 = 'A'; src2 <= 'Z'; src2++) {
            dst2 = (*key)[src2] + case_shift;
            for (src3 = 'A'; src3 <= 'Z'; src3++) {
                dst3 = (*key)[src3] + case_shift;
                new_trift[26 * 26 * dst1 + 26 * dst2 + dst3] =
                    s->trift[26 * 26 * src1 + 26 * src2 + src3];
            }
            new_bift[26 * dst1 + dst2] = s->bift[26 * src1 + src2];
        }
        new_slft[dst1] = s->slft[src1];
    }

    result->slft = new_slft;
    result->bift = new_bift;
    result->trift = new_trift;

    /* Errors depend on the relabelled tables, so they are recomputed */
    result->slft_error = slft_error(slft_std, result->slft);
    result->bift_error = bift_error(bift_std, result->bift);
    result->trift_error = trift_error(trift_std, result->trift);
    result->total_error = total_error(result->slft_error, result->bift_error, result->trift_error);

    /* These values are carried over from the source stats unchanged */
    result->letter_count = s->letter_count;
    result->ic = s->ic;
    result->entropy = s->entropy;
    result->efficiency = s->efficiency;
    result->redundancy = s->redundancy;
    return result;
}

/* Calculate stats for a section of text
 *
 * Allocates the three frequency tables, fills them from `text` with
 * make_ft(), and derives the errors against the supplied standard tables
 * plus the index of coincidence, entropy, efficiency and redundancy.
 *
 * NOTE(review): the malloc() results are not checked.
 * The returned object and its tables are heap-allocated; free_stats()
 * releases them. */
stats *make_stats(char *text, float *slft_std, float *bift_std, float *trift_std) {
    float *new_slft = malloc(((int) 'Z' + 1) * sizeof(float));
    float *new_bift = malloc(((int) 'Z' + 1) * ((int) 'Z' + 1)  * sizeof(float));
    float *new_trift = malloc(((int) 'Z' + 1) * ((int) 'Z' + 1) * ((int) 'Z' + 1) * sizeof(float));
    stats *result = malloc( sizeof(stats));

    result->letter_count = make_ft(text, new_slft, new_bift, new_trift);
    result->slft = new_slft;
    result->bift = new_bift;
    result->trift = new_trift;

    /* Distance of each table from its standard counterpart */
    result->slft_error = slft_error(slft_std, new_slft);
    result->bift_error = bift_error(bift_std, new_bift);
    result->trift_error = trift_error(trift_std, new_trift);
    result->total_error = total_error(result->slft_error, result->bift_error, result->trift_error);

    /* Single-letter statistics; each one feeds the next */
    result->ic = calc_ic(new_slft, result->letter_count);
    result->entropy = calc_entropy(new_slft);
    result->efficiency = calc_efficiency(result->entropy);
    result->redundancy = calc_redundancy(result->efficiency);
    return result;
}

/* Releases a stats object together with its three frequency tables.
 * Passing NULL is allowed and does nothing; NULL table pointers are also
 * fine because free(NULL) is a no-op. */
void free_stats(stats *the_stats) {
    if (the_stats != NULL) {
        free(the_stats->slft);
        free(the_stats->bift);
        free(the_stats->trift);
        free(the_stats);
    }
}

/* Returns a deep copy of the_stats: the three frequency tables are
 * duplicated into fresh allocations via dup_ft() and every scalar field is
 * copied.
 *
 * NOTE(review): the malloc() results are not checked.
 * The returned object and its tables are heap-allocated; free_stats()
 * releases them. */
stats *dup_stats(stats *the_stats) {
    float *copy_slft = malloc(((int) 'Z' + 1) * sizeof(float));
    float *copy_bift = malloc(((int) 'Z' + 1) * ((int) 'Z' + 1)  * sizeof(float));
    float *copy_trift = malloc(((int) 'Z' + 1) * ((int) 'Z' + 1) * ((int) 'Z' + 1) * sizeof(float));
    stats *copy = malloc( sizeof(stats));

    dup_ft(the_stats, copy_slft, copy_bift, copy_trift);
    copy->slft = copy_slft;
    copy->bift = copy_bift;
    copy->trift = copy_trift;

    /* Scalar fields are taken over as they are */
    copy->letter_count = the_stats->letter_count;
    copy->slft_error = the_stats->slft_error;
    copy->bift_error = the_stats->bift_error;
    copy->trift_error = the_stats->trift_error;
    copy->total_error = the_stats->total_error;
    copy->ic = the_stats->ic;
    copy->entropy = the_stats->entropy;
    copy->efficiency = the_stats->efficiency;
    copy->redundancy = the_stats->redundancy;
    return copy;
}

/* Stats file IO */
/* ------------- */

/* Load default fts */
/* Reads 26 whitespace-separated floats from `filename` into a newly
 * allocated single-letter table (entries 'A'..'Z', in that order).
 * If the file cannot be opened, or any value fails to parse, a warning is
 * logged and the whole table is replaced by the uniform fallback.
 *
 * NOTE(review): the malloc() result is not checked.
 * The caller owns the returned buffer. */
float *load_slft_std(char *filename) {
    float *table = malloc(((int) 'Z' + 1) * sizeof(float));
    FILE *in = fopen(filename, "r");
    int letter;

    if (in == NULL) {
        g_warning("Error opening slft file: %s", filename);
        fallback_ft(table, FT_SLFT);
        return table;
    }

    for (letter = 'A'; letter <= 'Z'; letter++) {
        if (fscanf(in, "%f", &table[letter]) != 1) {
            g_warning("Error in slft file: %s", filename);
            fallback_ft(table, FT_SLFT);
            break;
        }
    }
    fclose(in);

    return table;
}

/* Reads 26 * 26 whitespace-separated floats from `filename` into a newly
 * allocated bigram table; the value for the pair (i, j) is stored at
 * 26 * i + j with i, j in 'A'..'Z', first letter varying slowest.
 * If the file cannot be opened, or any value fails to parse, a warning is
 * logged and the whole table is replaced by the uniform fallback.
 *
 * NOTE(review): the malloc() result is not checked.
 * The caller owns the returned buffer. */
float *load_bift_std(char *filename) {
    float *table = malloc(((int) 'Z' + 1) * ((int) 'Z' + 1) * sizeof(float));
    FILE *in = fopen(filename, "r");
    int first, second;
    int parsed_ok = 1;

    if (in == NULL) {
        g_warning("Error opening bift file: %s", filename);
        fallback_ft(table, FT_BIFT);
        return table;
    }

    /* Stop at the first value that does not parse */
    for (first = 'A'; parsed_ok && first <= 'Z'; first++) {
        for (second = 'A'; parsed_ok && second <= 'Z'; second++) {
            if (fscanf(in, "%f", &table[26 * first + second]) != 1)
                parsed_ok = 0;
        }
    }
    if (!parsed_ok) {
        g_warning("Error in bift file: %s", filename);
        fallback_ft(table, FT_BIFT);
    }
    fclose(in);

    return table;
}

/* Reads 26 * 26 * 26 whitespace-separated floats from `filename` into a
 * newly allocated trigram table; the value for the triple (i, j, k) is
 * stored at 26 * 26 * i + 26 * j + k with i, j, k in 'A'..'Z', first letter
 * varying slowest.
 * If the file cannot be opened, or any value fails to parse, a warning is
 * logged and the whole table is replaced by the uniform fallback.
 *
 * NOTE(review): the malloc() result is not checked.
 * The caller owns the returned buffer. */
float *load_trift_std(char *filename) {
    float *table = malloc(((int) 'Z' + 1) * ((int) 'Z' + 1) * ((int) 'Z' + 1) * sizeof(float));
    FILE *in = fopen(filename, "r");
    int first, second, third;
    int parsed_ok = 1;

    if (in == NULL) {
        g_warning("Error opening trift file: %s", filename);
        fallback_ft(table, FT_TRIFT);
        return table;
    }

    /* Stop at the first value that does not parse */
    for (first = 'A'; parsed_ok && first <= 'Z'; first++) {
        for (second = 'A'; parsed_ok && second <= 'Z'; second++) {
            for (third = 'A'; parsed_ok && third <= 'Z'; third++) {
                if (fscanf(in, "%f", &table[26 * 26 * first + 26 * second + third]) != 1)
                    parsed_ok = 0;
            }
        }
    }
    if (!parsed_ok) {
        g_warning("Error in trift file: %s", filename);
        fallback_ft(table, FT_TRIFT);
    }
    fclose(in);

    return table;
}

/* Saving default fts */
/* Writes the single-letter table to DEFAULT_SLFT, one value per line with
 * eight decimals, in the order 'A'..'Z'.  A failure to open the file is
 * reported through g_error(), which in GLib is fatal.
 * NOTE(review): write and close errors are not checked. */
void do_save_slft(float *slft) {
    int letter;
    FILE *out = fopen(DEFAULT_SLFT, "w");

    if (out == NULL)
        g_error("Error: Cannot open data/slft.dat");

    letter = 'A';
    while (letter <= 'Z') {
        fprintf(out, "%.8f\n", slft[letter]);
        letter++;
    }
    fclose(out);
}

/* Writes the bigram table to DEFAULT_BIFT, one value per line with eight
 * decimals; pairs are emitted with the first letter varying slowest, both
 * letters running over 'A'..'Z'.  A failure to open the file is reported
 * through g_error(), which in GLib is fatal.
 * NOTE(review): write and close errors are not checked. */
void do_save_bift(float *bift) {
    int first, second;
    FILE *out = fopen(DEFAULT_BIFT, "w");

    if (out == NULL)
        g_error("Error: Cannot open data/bift.dat");

    for (first = 'A'; first <= 'Z'; first++) {
        float *row = bift + 26 * first;
        for (second = 'A'; second <= 'Z'; second++)
            fprintf(out, "%.8f\n", row[second]);
    }
    fclose(out);
}

/* Writes the trigram table to DEFAULT_TRIFT, one value per line with eight
 * decimals; triples are emitted with the first letter varying slowest, all
 * three letters running over 'A'..'Z'.  A failure to open the file is
 * reported through g_error(), which in GLib is fatal.
 * NOTE(review): write and close errors are not checked. */
void do_save_trift(float *trift) {
    int first, second, third;
    FILE *out = fopen(DEFAULT_TRIFT, "w");

    if (out == NULL)
        g_error("Error: Cannot open data/trift.dat");

    for (first = 'A'; first <= 'Z'; first++) {
        for (second = 'A'; second <= 'Z'; second++) {
            float *row = trift + 26 * 26 * first + 26 * second;
            for (third = 'A'; third <= 'Z'; third++)
                fprintf(out, "%.8f\n", row[third]);
        }
    }
    fclose(out);
}
