Hi, can anyone help me convert this code to Java and tell me which classes it should have?
// Clustering.cpp : Defines the entry point for the console application.
//
#include "stdafx.h"
#include <string.h>
#include "MersenneTwister.h"
// Dissimilarity returned for a nominal field whose two values differ
// (see dissimilarity_catagorical); also that field's maximum in distance().
#define CATAGORICAL_CONST 30.0
// Not referenced anywhere in this file.
#define MAX_FILENAME_SIZE 500
// Size of InputBuffer, the buffer each input line is read into.
#define MAX_BUFFER_SIZE 500
// Size of every token / field text buffer and of the file-name buffers.
#define MAX_STRING_SIZE 100
// Maximum number of comma-separated fields per record.
#define MAX_TOKENS 100
// Number of chromosomes kept in the genetic-algorithm population.
#define MAX_POPULATION 20
// Field type codes stored in FieldType[].
#define TOKEN_UNUSED 0
#define TOKEN_LINEAR_INTEGER 1
#define TOKEN_LINEAR_FLOAT 2
#define TOKEN_NOMINAL 3
// Capacity of Records, allocated_cluster and cluster_sizes.
#define MAX_RECORDS 10000
// Capacity of the peaks/heights and troughs/depths tables.
#define MAX_NUMBER_OF_PEAKS 10
// Number of histogram bins: one per distance value 0..100.
#define FREQUENCY_100 101
// Numerator of the acceptance probability used by readSampleData
// (SAMPLE_SIZE / total_number_of_records).
#define SAMPLE_SIZE 300
// One field of one record: the raw token text plus, for numeric fields,
// the parsed value.
struct Field
{
char text[MAX_STRING_SIZE]; // token text copied from the input line
float value; // only written by the readers for TOKEN_LINEAR_INTEGER / TOKEN_LINEAR_FLOAT fields
};
// One candidate feature subset for the genetic algorithm.
struct Chromosome
{
int fitness; // value returned by fitness(); replace() and getBest() treat lower as better
bool genes[MAX_TOKENS]; // genes[i] == true means field i takes part in distance()
};
// Line buffer filled by getline() in every reader.
char InputBuffer [MAX_BUFFER_SIZE];
// Tokens of the most recently split line (filled by getTokens()).
char Tokens[MAX_TOKENS][MAX_STRING_SIZE];
// Per-field TOKEN_* code, decided by setFieldType() / setMinMax().
int FieldType[MAX_TOKENS];
// Per-field smallest / largest numeric value, set by setMinMax().
float MinFieldType[MAX_TOKENS];
float MaxFieldType[MAX_TOKENS];
// Records loaded by the read* functions, indexed [record][field].
Field Records[MAX_RECORDS][MAX_TOKENS];
// Cluster id assigned to each record; -1 means not allocated yet.
int allocated_cluster[MAX_RECORDS];
// Per-field mask: init_g() sets every entry true and clusterFile() clears the
// ignored field; initPopulation() and mutation() keep masked genes false.
bool g[MAX_TOKENS];
// Genetic-algorithm population and the child currently being evaluated.
Chromosome population[MAX_POPULATION];
Chromosome child;
// Number of loaded records the distance histogram and cluster analysis iterate over.
int number_of_points;
// Upper bound on the field index examined by distance().
int number_of_features;
// Results of count_troughs(): how many troughs, their bins and smoothed frequencies.
int number_of_troughs;
int troughs[MAX_NUMBER_OF_PEAKS];
int depths [MAX_NUMBER_OF_PEAKS];
// Results of count_peaks(): how many peaks, their bins and smoothed frequencies.
int number_of_peaks;
int peaks[MAX_NUMBER_OF_PEAKS];
int heights[MAX_NUMBER_OF_PEAKS];
// Record count from readNumberOfRecords(); used by readSampleData() for its sampling ratio.
int total_number_of_records;
// Histogram of pairwise distances (freq_100), the first smoothing pass
// (freq_100_int) and the second smoothing pass (freq_100_ave).
int freq_100 [FREQUENCY_100 ];
int freq_100_int[FREQUENCY_100];
int freq_100_ave[FREQUENCY_100];
// Last cluster id handed out by get_next_cluster(); -1 means none yet.
int next_cluster = -1;
// Number of records in each cluster, indexed by cluster id.
int cluster_sizes[MAX_RECORDS];
// Random generator from MersenneTwister.h; called as random() and compared
// against probabilities, seeded in _tmain().
MTRand random;
// Hands out the next unused cluster id by bumping the global counter.
// With next_cluster reset to -1 the first id returned is 0.
int get_next_cluster()
{
    return ++next_cluster;
}
// Enables every field in the global mask g.
void init_g()
{
    int field = 0;
    while (field < MAX_TOKENS) {
        g[field] = true;
        ++field;
    }
}
// Marks every field as TOKEN_UNUSED so setFieldType() can classify from scratch.
void initFieldType()
{
    int field = 0;
    while (field < MAX_TOKENS) {
        FieldType[field] = TOKEN_UNUSED;
        ++field;
    }
}
// Resets every record to "not allocated to any cluster" (-1).
void initAllocatedCluster()
{
    int record = 0;
    while (record < MAX_RECORDS) {
        allocated_cluster[record] = -1;
        ++record;
    }
}
void initClusterSizes()
{
int i;
for (i=0; i<MAX_RECORDS; i++)
{
cluster_sizes[i] = 0;
}
}
// Reads the next non-empty line from fp into buffer as a NUL-terminated string.
//
// Blank lines are skipped. buffer must hold at least MAX_BUFFER_SIZE bytes;
// characters that do not fit in MAX_BUFFER_SIZE - 1 are read and discarded.
// Returns true when a line was stored, false when EOF is reached with
// nothing read (buffer is then the empty string).
bool getline(FILE *fp, char *buffer)
{
    int length = 0;

    for (;;) {
        // getc returns int: keeping it as int lets EOF be told apart from
        // every valid character value regardless of char signedness.
        const int c = getc(fp);
        if (c == EOF) {
            break;
        }
        if (c == '\n') {
            if (length > 0) {
                break;
            }
            continue; // blank line: keep looking
        }
        if (length < MAX_BUFFER_SIZE - 1) {
            buffer[length++] = (char)c;
        }
    }

    // Terminate on every path, including a final line without a newline.
    buffer[length] = '\0';
    return length > 0;
}
// Splits buffer on seperator into the global Tokens table.
//
// Every call clears Tokens first. At most MAX_TOKENS tokens are stored and
// each token keeps at most MAX_STRING_SIZE - 1 characters, so the table can
// never be overrun; excess fields and excess characters are dropped.
// Returns the number of tokens stored (0 for an empty buffer, otherwise
// separators + 1, capped at MAX_TOKENS).
int getTokens(char* buffer, char seperator)
{
    if (buffer[0] == '\0') return 0;

    memset(Tokens, 0, sizeof Tokens);

    int token = 0;
    int length = 0;
    for (int i = 0; buffer[i] != '\0'; i++) {
        if (buffer[i] == seperator) {
            if (token == MAX_TOKENS - 1) {
                break; // table is full: ignore the remaining fields
            }
            token++;
            length = 0;
        } else if (length < MAX_STRING_SIZE - 1) {
            // Leave the last byte as the '\0' written by memset.
            Tokens[token][length++] = buffer[i];
        }
    }
    return token + 1;
}
// True when string is non-empty and consists solely of the digits '0'..'9'.
// Signs, dots, spaces and the missing-value marker '?' all make it false.
bool isInteger(char* string)
{
    const int length = (int)strlen(string);
    if (length == 0) {
        return false;
    }
    for (int pos = 0; pos < length; pos++) {
        const char ch = string[pos];
        if (ch < '0' || ch > '9') {
            return false;
        }
    }
    return true;
}
// True when string contains only digits and dots, with exactly one dot and
// at least one digit (e.g. "1.5", ".5", "5."). Anything else, including the
// empty string, a plain integer or '?', is false.
bool isFloat(char* string)
{
    int digits = 0;
    int dots = 0;

    if (string[0] == '\0') {
        return false;
    }
    for (const char *p = string; *p != '\0'; p++) {
        if (*p == '.') {
            dots++;
        } else if (*p >= '0' && *p <= '9') {
            digits++;
        } else {
            return false;
        }
    }
    return dots == 1 && digits > 0;
}
// True when string contains at least one ASCII letter and does not start
// with the missing-value marker '?'. The empty string is false.
bool isText(char* string)
{
    if (string[0] == '\0' || string[0] == '?') {
        return false;
    }
    for (const char *p = string; *p != '\0'; p++) {
        const bool lower = (*p >= 'a') && (*p <= 'z');
        const bool upper = (*p >= 'A') && (*p <= 'Z');
        if (lower || upper) {
            return true;
        }
    }
    return false;
}
void setFieldType(char* filename, bool titles_present)
{
FILE *ifp;
int number_of_lines = 0;
int number_of_tokens;
int i;
if ( ( ifp = fopen( filename, "r" ) ) != NULL )
{
while (getline(ifp, InputBuffer))
{
if ((!titles_present) || (number_of_lines > 0))
{
number_of_tokens = getTokens(InputBuffer, ',');
if (number_of_tokens > 0)
{
for (i=0; i<number_of_tokens; i++)
{
switch (FieldType[i])
{
case TOKEN_UNUSED:
if (isText(Tokens[i])) FieldType[i] = TOKEN_NOMINAL;
if (isInteger(Tokens[i])) FieldType[i] = TOKEN_LINEAR_INTEGER;
if (isFloat(Tokens[i])) FieldType[i] = TOKEN_LINEAR_FLOAT;
break;
case TOKEN_LINEAR_INTEGER:
if (isText(Tokens[i])) FieldType[i] = TOKEN_NOMINAL;
if (isFloat(Tokens[i])) FieldType[i] = TOKEN_LINEAR_FLOAT;
break;
case TOKEN_LINEAR_FLOAT:
if (isText(Tokens[i])) FieldType[i] = TOKEN_NOMINAL;
break;
case TOKEN_NOMINAL:
break;
default:
break;
}
}
}
}
number_of_lines++;
}
fclose(ifp);
}
else
{
printf("\n");
printf("\n ABORTING >> Data File %s not found. ", filename);
printf("\n");
}
}
void setMinMax(char* filename, bool titles_present)
{
FILE *ifp;
bool init_min_max = true;
int number_of_lines = 0;
int number_of_tokens;
int i;
float value;
long d_value;
if ( ( ifp = fopen( filename, "r" ) ) != NULL )
{
while (getline(ifp, InputBuffer))
{
if ((!titles_present) || (number_of_lines > 0))
{
number_of_tokens = getTokens(InputBuffer, ',');
if (number_of_tokens > 0)
{
for (i=0; i<number_of_tokens; i++)
{
switch (FieldType[i])
{
case TOKEN_UNUSED:
break;
case TOKEN_LINEAR_INTEGER:
if (sscanf(Tokens[i], "%d", &d_value) == 1)
{
value = (float)d_value;
if (init_min_max)
{
MinFieldType[i] = value;
MaxFieldType[i] = value;
}
else
{
if (MinFieldType[i] > value) MinFieldType[i] = value;
if (MaxFieldType[i] < value) MaxFieldType[i] = value;
}
}
break;
case TOKEN_LINEAR_FLOAT:
if (sscanf(Tokens[i], "%f", &value) == 1)
{
if (init_min_max)
{
MinFieldType[i] = value;
MaxFieldType[i] = value;
}
else
{
if (MinFieldType[i] > value) MinFieldType[i] = value;
if (MaxFieldType[i] < value) MaxFieldType[i] = value;
}
}
break;
case TOKEN_NOMINAL:
break;
default:
break;
}
}
}
init_min_max = false;
}
number_of_lines++;
}
fclose(ifp);
// Cater for 0..1 being a nominal value
for (i=0; i<number_of_tokens; i++)
{
switch (FieldType[i])
{
case TOKEN_UNUSED:
break;
case TOKEN_LINEAR_INTEGER:
if ((MinFieldType[i] == 0) && (MaxFieldType[i] == 1))
{
FieldType[i] = TOKEN_NOMINAL;
}
break;
case TOKEN_LINEAR_FLOAT:
break;
case TOKEN_NOMINAL:
break;
default:
break;
}
}
}
else
{
printf("\n");
printf("\n ABORTING >> Data File %s not found. ", filename);
printf("\n");
}
}
// Prints the type (and numeric range) of every classified field and returns
// how many fields are not TOKEN_UNUSED.
int printFieldType()
{
    int number_of_fields = 0;

    printf("\n");
    for (int i = 0; i < MAX_TOKENS; i++) {
        switch (FieldType[i]) {
        case TOKEN_LINEAR_INTEGER:
            // %ld matches the long arguments.
            printf("Field %d is LINEAR (%ld ... %ld)\n", i, (long)MinFieldType[i], (long)MaxFieldType[i]);
            number_of_fields++;
            break;
        case TOKEN_LINEAR_FLOAT:
            printf("Field %d is LINEAR (%f ... %f)\n", i, MinFieldType[i], MaxFieldType[i]);
            number_of_fields++;
            break;
        case TOKEN_NOMINAL:
            printf("Field %d is NOMINAL\n", i);
            number_of_fields++;
            break;
        default:
            break;
        }
    }
    printf("\n");
    printf("There are %d fields\n", number_of_fields);
    printf("\n");
    return number_of_fields;
}
int readNumberOfRecords(char* filename, bool titles_present)
{
FILE *ifp;
int number_of_lines = 0;
int number_of_records = 0;
int number_of_tokens;
int i;
float f_value;
long d_value;
bool record_okay;
if ( ( ifp = fopen( filename, "r" ) ) != NULL )
{
printf("\n");
printf("Reading Number Of Records in %s\n", filename);
while (getline(ifp, InputBuffer))
{
if ((!titles_present) || (number_of_lines > 0))
{
number_of_tokens = getTokens(InputBuffer, ',');
if (number_of_tokens > 0)
{
record_okay = true;
for (i=0; i<number_of_tokens; i++)
{
if (strcmp(Tokens[i], "?") == 0) record_okay = false;
strcpy(Records[number_of_records][i].text, Tokens[i]);
switch (FieldType[i])
{
case TOKEN_UNUSED:
break;
case TOKEN_LINEAR_INTEGER:
if (sscanf(Tokens[i], "%d", &d_value) != 1)
{
record_okay = false;
}
break;
case TOKEN_LINEAR_FLOAT:
if (sscanf(Tokens[i], "%f", &f_value) != 1)
{
record_okay = false;
}
break;
case TOKEN_NOMINAL:
break;
default:
break;
}
}
if (record_okay) number_of_records++;
}
}
number_of_lines++;
}
fclose(ifp);
printf("... %d records read\n", number_of_records);
printf("\n");
}
else
{
printf("\n");
printf("\n ABORTING >> Data File %s not found. ", filename);
printf("\n");
}
return number_of_records;
}
int readSampleData(char* filename, bool titles_present)
{
FILE *ifp;
int number_of_lines = 0;
int number_of_records = 0;
int number_of_tokens;
int i;
float f_value;
long d_value;
bool record_okay;
double sample_limit;
sample_limit = (double)SAMPLE_SIZE / (double)total_number_of_records;
if ( ( ifp = fopen( filename, "r" ) ) != NULL )
{
printf("\n");
printf("Reading sample data from %s\n", filename);
while (getline(ifp, InputBuffer))
{
if ((!titles_present) || (number_of_lines > 0))
{
number_of_tokens = getTokens(InputBuffer, ',');
if (number_of_tokens > 0)
{
record_okay = true;
for (i=0; i<number_of_tokens; i++)
{
if (strcmp(Tokens[i], "?") == 0) record_okay = false;
strcpy(Records[number_of_records][i].text, Tokens[i]);
switch (FieldType[i])
{
case TOKEN_UNUSED:
break;
case TOKEN_LINEAR_INTEGER:
if (sscanf(Tokens[i], "%d", &d_value) == 1)
{
f_value = (float)d_value;
Records[number_of_records][i].value = f_value;
}
else
{
record_okay = false;
}
break;
case TOKEN_LINEAR_FLOAT:
if (sscanf(Tokens[i], "%f", &f_value) == 1)
{
Records[number_of_records][i].value = f_value;
}
else
{
record_okay = false;
}
break;
case TOKEN_NOMINAL:
break;
default:
break;
}
}
if (record_okay)
{
if (random() < sample_limit)
{
number_of_records++;
}
}
}
}
number_of_lines++;
}
fclose(ifp);
printf("... %d records read\n", number_of_records);
printf("\n");
}
else
{
printf("\n");
printf("\n ABORTING >> Data File %s not found. ", filename);
printf("\n");
}
return number_of_records;
}
int readData(char* filename, bool titles_present)
{
FILE *ifp;
int number_of_lines = 0;
int number_of_records = 0;
int number_of_tokens;
int i;
float f_value;
long d_value;
bool record_okay;
if ( ( ifp = fopen( filename, "r" ) ) != NULL )
{
printf("\n");
printf("Reading %s\n", filename);
if (number_of_records >= MAX_RECORDS) return MAX_RECORDS; // Safety check
while (getline(ifp, InputBuffer))
{
if ((!titles_present) || (number_of_lines > 0))
{
number_of_tokens = getTokens(InputBuffer, ',');
if (number_of_tokens > 0)
{
record_okay = true;
for (i=0; i<number_of_tokens; i++)
{
if (strcmp(Tokens[i], "?") == 0) record_okay = false;
strcpy(Records[number_of_records][i].text, Tokens[i]);
switch (FieldType[i])
{
case TOKEN_UNUSED:
break;
case TOKEN_LINEAR_INTEGER:
if (sscanf(Tokens[i], "%d", &d_value) == 1)
{
f_value = (float)d_value;
Records[number_of_records][i].value = f_value;
}
else
{
record_okay = false;
}
break;
case TOKEN_LINEAR_FLOAT:
if (sscanf(Tokens[i], "%f", &f_value) == 1)
{
Records[number_of_records][i].value = f_value;
}
else
{
record_okay = false;
}
break;
case TOKEN_NOMINAL:
break;
default:
break;
}
}
if (record_okay) number_of_records++;
}
}
number_of_lines++;
}
fclose(ifp);
printf("... %d records read\n", number_of_records);
printf("\n");
}
else
{
printf("\n");
printf("\n ABORTING >> Data File %s not found. ", filename);
printf("\n");
}
return number_of_records;
}
// Scales value to a 0..100 integer relative to the span max_value - min_value,
// truncating toward zero. value is treated as a difference: min_value is not
// subtracted from it.
// A zero span (max_value == min_value) yields 0 instead of dividing by zero.
int map100(float value, float min_value, float max_value)
{
    const float span = max_value - min_value;
    if (span == 0.0f) {
        // A field with no spread cannot separate two records.
        return 0;
    }
    return (int)((100.0 * value) / span);
}
// Dissimilarity of two numeric values: their absolute difference scaled by
// map100() against the field's range. A negative scaled result is reported
// as 100.
float dissimilarity_linear(float a, float b, float min_value, float max_value)
{
    const float gap = (a > b) ? (a - b) : (b - a);
    const float scaled = (float)map100(gap, min_value, max_value);
    if (scaled < 0.0) {
        return 100;
    }
    return scaled;
}
// Dissimilarity of two nominal values: 0 when the strings are equal,
// CATAGORICAL_CONST otherwise.
float dissimilarity_catagorical(char* a, char* b)
{
    const bool same = (strcmp(a, b) == 0);
    return same ? 0.0 : CATAGORICAL_CONST;
}
// Distance between records a and b on a 0..100 scale, using only the fields
// whose gene in g is set.
//
// Numeric fields add dissimilarity_linear() with a maximum of 100, nominal
// fields add dissimilarity_catagorical() with a maximum of CATAGORICAL_CONST;
// the sum is then rescaled by map100() against the sum of maxima.
// Returns 100 when no enabled field contributes or the result is negative.
int distance(bool* g, int a, int b)
{
    float total = 0.0;
    float floor_sum = 0.0; // never changes: the scale always starts at 0
    float ceiling_sum = 0.0;

    for (int field = 0; field < number_of_features; field++) {
        if (!g[field]) {
            continue;
        }
        const int type = FieldType[field];
        if (type == TOKEN_LINEAR_INTEGER || type == TOKEN_LINEAR_FLOAT) {
            total = total + dissimilarity_linear(Records[a][field].value, Records[b][field].value, MinFieldType[field], MaxFieldType[field]);
            ceiling_sum = ceiling_sum + 100.0;
        } else if (type == TOKEN_NOMINAL) {
            total = total + dissimilarity_catagorical(Records[a][field].text, Records[b][field].text);
            ceiling_sum = ceiling_sum + CATAGORICAL_CONST;
        }
    }

    if (!(ceiling_sum > 0.0)) {
        return 100;
    }
    const float dist = map100(total, floor_sum, ceiling_sum);
    if (dist >= 0.0) {
        return dist;
    }
    return 100;
}
// Rebuilds freq_100: a histogram of distance() over every unordered pair of
// the first number_of_points records. Distances outside 0..100 are ignored.
void get_freq_100(bool* g)
{
    for (int bin = 0; bin < FREQUENCY_100; bin++) {
        freq_100[bin] = 0;
    }

    for (int first = 0; first < number_of_points; first++) {
        for (int second = first + 1; second < number_of_points; second++) {
            const int bin = distance(g, first, second);
            if (bin >= 0 && bin <= 100) {
                freq_100[bin]++;
            }
        }
    }
}
// Smooths freq_100 twice with a moving average over the bins within 4 of
// each position (window clipped at the array ends, integer division):
// freq_100 -> freq_100_int -> freq_100_ave.
void get_freq_100_ave()
{
    // Intermediate averages
    for (int centre = 0; centre < FREQUENCY_100; centre++) {
        const int lo = (centre - 4 < 0) ? 0 : centre - 4;
        const int hi = (centre + 4 > FREQUENCY_100 - 1) ? FREQUENCY_100 - 1 : centre + 4;
        int sum = 0;
        for (int k = lo; k <= hi; k++) {
            sum += freq_100[k];
        }
        freq_100_int[centre] = sum / (hi - lo + 1);
    }

    // Full averages
    for (int centre = 0; centre < FREQUENCY_100; centre++) {
        const int lo = (centre - 4 < 0) ? 0 : centre - 4;
        const int hi = (centre + 4 > FREQUENCY_100 - 1) ? FREQUENCY_100 - 1 : centre + 4;
        int sum = 0;
        for (int k = lo; k <= hi; k++) {
            sum += freq_100_int[k];
        }
        freq_100_ave[centre] = sum / (hi - lo + 1);
    }
}
// Finds local maxima of freq_100_ave: bins approached by two non-decreasing
// steps from the left and followed by two non-increasing steps on the right,
// with the outer step on the left and the inner step on the right strict.
//
// The first MAX_NUMBER_OF_PEAKS are stored in peaks/heights; the returned
// count (also left in number_of_peaks) keeps growing beyond that.
int count_peaks()
{
    const int *f = freq_100_ave;

    for (int slot = 0; slot < MAX_NUMBER_OF_PEAKS; slot++) {
        peaks[slot] = 0;
        heights[slot] = 0;
    }
    number_of_peaks = 0;

    for (int bin = 2; bin < FREQUENCY_100 - 2; bin++) {
        const bool rising = (f[bin - 2] < f[bin - 1]) && (f[bin - 1] <= f[bin]);
        const bool falling = (f[bin + 2] <= f[bin + 1]) && (f[bin + 1] < f[bin]);
        if (!(rising && falling)) {
            continue;
        }
        if (number_of_peaks < MAX_NUMBER_OF_PEAKS) {
            peaks[number_of_peaks] = bin;
            heights[number_of_peaks] = f[bin];
        }
        number_of_peaks++;
    }
    return number_of_peaks;
}
// Finds troughs (local minima) of freq_100_ave and stores the first
// MAX_NUMBER_OF_PEAKS in troughs/depths.
//
// A bin is a trough when the curve falls into it and rises again within the
// next 2, 3 or 4 bins, or when the curve drops from a positive value to 0.
// A trough directly after the previous one replaces it instead of being
// counted twice. Returns the count, also left in number_of_troughs.
int count_troughs()
{
    const int *f = freq_100_ave;

    number_of_troughs = 0;
    for (int slot = 0; slot < MAX_NUMBER_OF_PEAKS; slot++) {
        troughs[slot] = 0;
        depths[slot] = 0;
    }

    for (int i = 2; i < FREQUENCY_100 - 3; i++) {
        const bool falling = (f[i - 2] > f[i - 1]) && (f[i - 1] >= f[i]);
        const bool rise_at_2 = (f[i + 2] > f[i + 1]) && (f[i + 1] >= f[i]);
        const bool rise_at_3 = (f[i + 3] > f[i + 2]) && (f[i + 2] >= f[i + 1]);
        // i + 4 is 101 for the last i, one past the end of the array, so the
        // widest pattern is only tested where it fits.
        // NOTE(review): by analogy with the other patterns the second
        // comparison may have been meant as f[i+3] >= f[i+2]; kept as written.
        const bool rise_at_4 = (i + 4 < FREQUENCY_100)
            && (f[i + 4] > f[i + 3]) && (f[i + 2] >= f[i + 1]);
        const bool hits_zero = (f[i - 1] > 0) && (f[i] == 0);

        if ((falling && (rise_at_2 || rise_at_3 || rise_at_4)) || hits_zero) {
            if (number_of_troughs < MAX_NUMBER_OF_PEAKS) {
                // Adjacent trough: overwrite the previous entry.
                if (number_of_troughs > 0 && troughs[number_of_troughs - 1] == (i - 1)) {
                    number_of_troughs--;
                }
                troughs[number_of_troughs] = i;
                depths[number_of_troughs] = f[i];
            }
            number_of_troughs++;
        }
    }
    return number_of_troughs;
}
// Writes freq_100 to outfile: a "Frequency" header line followed by one
// count per line. Does nothing if the file cannot be opened.
void save_freq_100(char* outfile)
{
    FILE *out = fopen(outfile, "w");
    if (out == NULL) {
        return;
    }
    fprintf(out, "Frequency\n");
    for (int bin = 0; bin < FREQUENCY_100; bin++) {
        fprintf(out, "%d\n", freq_100[bin]);
    }
    fclose(out);
}
// Writes freq_100_ave to outfile: an "AverageFrequency" header line followed
// by one value per line. Does nothing if the file cannot be opened.
void save_freq_100_ave(char* outfile)
{
    FILE *out = fopen(outfile, "w");
    if (out == NULL) {
        return;
    }
    fprintf(out, "AverageFrequency\n");
    for (int bin = 0; bin < FREQUENCY_100; bin++) {
        fprintf(out, "%d\n", freq_100_ave[bin]);
    }
    fclose(out);
}
void clusterData(bool* g, int number_of_records, int clustering_distance)
{
int i;
int j;
int k;
int current_cluster;
int new_cluster;
printf("\n");
printf("Clustering\n");
printf("\n");
for (i=0; i<number_of_records; i++)
{
if (allocated_cluster[i] == -1) // not allocated
{
current_cluster = get_next_cluster();
allocated_cluster[i] = current_cluster;
}
else
{
current_cluster = allocated_cluster[i];
}
for (j=i+1; j<number_of_records; j++)
{
if (distance(g, i, j) < clustering_distance)
{
if (allocated_cluster[j] == -1) // not allocated
{
allocated_cluster[j] = current_cluster;
}
else
{
// merge clusters
new_cluster = allocated_cluster[j];
for (k=0; k<number_of_records; k++)
{
if (allocated_cluster[k] == current_cluster) allocated_cluster[k] = new_cluster;
}
current_cluster = new_cluster;
}
}
}
}
}
// Adds one to cluster_sizes[id] for every record whose allocated cluster id
// is a valid index; unallocated records (-1) are skipped.
void setClusterSizes()
{
    for (int record = 0; record < MAX_RECORDS; record++) {
        const int id = allocated_cluster[record];
        if (id < 0 || id >= MAX_RECORDS) {
            continue;
        }
        cluster_sizes[id]++;
    }
}
// Prints the size of every non-empty cluster to stdout and to rfp.
void reportClusters(FILE* rfp)
{
    for (int id = 0; id < MAX_RECORDS; id++) {
        const int members = cluster_sizes[id];
        if (members <= 0) {
            continue;
        }
        printf("Cluster %d has %d members\n", id, members);
        fprintf(rfp, "Cluster %d has %d members\n", id, members);
    }
}
// Scores a gene set; lower is better.
//
// Rebuilds the distance histogram for genes, smooths it, and counts peaks and
// troughs (printing both). When there is more than one peak, at least one
// trough, and the first trough lies at bin 3 or later, the score is that bin
// plus (number of enabled, used fields - 1). Otherwise the score is 1000000.
int fitness(bool* genes)
{
    // Starts at -1 so a single enabled field carries no penalty.
    int size = -1;
    for (int field = 0; field < MAX_TOKENS; field++) {
        if (genes[field] && FieldType[field] != TOKEN_UNUSED) {
            size++; // add a penalty for having redundant genes being present
        }
    }

    printf("\n");
    printf("--------------------------\n");
    printf("\n");
    get_freq_100(genes);
    get_freq_100_ave();

    number_of_peaks = count_peaks();
    printf("Peaks = %d\n", number_of_peaks);
    for (int slot = 0; slot < MAX_NUMBER_OF_PEAKS; slot++) {
        if (peaks[slot] != 0) printf("%d (%d) ", peaks[slot], heights[slot]);
    }
    printf("\n");
    printf("\n");

    number_of_troughs = count_troughs();
    printf("Troughs = %d\n", number_of_troughs);
    for (int slot = 0; slot < MAX_NUMBER_OF_PEAKS; slot++) {
        if (troughs[slot] != 0) printf("%d (%d) ", troughs[slot], depths[slot]);
    }
    printf("\n");
    printf("\n");

    const bool usable = (number_of_peaks > 1) && (number_of_troughs >= 1) && (troughs[0] >= 3);
    if (!usable) {
        printf("Fitness = 1000000\n\n");
        return 1000000;
    }
    printf("Fitness = %d\n\n", troughs[0] + size);
    return troughs[0] + size;
}
void initPopulation()
{
int i;
int j;
for (i=0; i<MAX_POPULATION; i++)
{
population[i].fitness = 1000;
for (j=0; j<MAX_TOKENS; j++)
{
if (g[j])
{
if (random() < 0.5)
{
population[i].genes[j] = true;
}
else
{
population[i].genes[j] = false;
}
}
else
{
population[i].genes[j] = false;
}
}
population[i].fitness = fitness(population[i].genes);
}
}
// Picks a random population index in 0..MAX_POPULATION-1, redrawing whenever
// the scaled random number lands on MAX_POPULATION or above.
int select()
{
    int pick;
    do {
        pick = (int)(random() * MAX_POPULATION);
    } while (pick >= MAX_POPULATION);
    return pick;
}
// Builds child by uniform crossover: each gene is copied from parent a or
// parent b on an independent coin flip.
void crossover(int a, int b)
{
    for (int field = 0; field < MAX_TOKENS; field++) {
        const int parent = (random() < 0.5) ? a : b;
        child.genes[field] = population[parent].genes[field];
    }
}
void mutation(int number_of_fields)
{
int index;
index = (int)(random() * number_of_fields);
if (random() < 0.5)
{
if (g[index])
{
child.genes[index] = !child.genes[index];
}
else
{
child.genes[index] = false;
}
}
}
// Scores child and, if it beats a randomly selected population member
// (strictly lower fitness), overwrites that member with it.
void replace()
{
    child.fitness = fitness(child.genes);

    const int victim = select();
    if (child.fitness >= population[victim].fitness) {
        return; // the existing member is at least as fit
    }
    for (int field = 0; field < MAX_TOKENS; field++) {
        population[victim].genes[field] = child.genes[field];
    }
    population[victim].fitness = child.fitness;
}
// Returns the index of the population member with the lowest fitness below
// 1000 (first one wins ties); returns 0 when no member scores below 1000.
int getBest()
{
    int winner = 0;
    int lowest = 1000;
    for (int member = 0; member < MAX_POPULATION; member++) {
        const int score = population[member].fitness;
        if (score < lowest) {
            lowest = score;
            winner = member;
        }
    }
    return winner;
}
void saveClusters(char* file, int number_of_records)
{
char cstfile[MAX_STRING_SIZE];
FILE* cfp;
int i;
int j;
int k;
for (i=0; i<number_of_records; i++)
{
if (cluster_sizes[i] > 0)
{
sprintf(cstfile, "%s_%d.txt", file, i);
if ( ( cfp = fopen( cstfile, "w" ) ) != NULL )
{
for (j=0; j<number_of_records; j++)
{
if (allocated_cluster[j] == i)
{
for (k=0; k<MAX_TOKENS; k++)
{
if (FieldType[k] != TOKEN_UNUSED)
{
if (k == 0)
{
fprintf(cfp, "%s", Records[j][k].text);
}
else
{
fprintf(cfp, ",%s", Records[j][k].text);
}
}
}
fprintf(cfp, "\n");
}
}
fclose(cfp);
}
else
{
printf("\n");
printf("\n ABORTING >> Output File %s not openned. ", cstfile);
printf("\n");
}
}
}
}
// Distinct nominal values collected by addAttribute() (at most 20 slots).
char attribs[20][MAX_STRING_SIZE];
// Number of entries of attribs currently in use; analyseClusters() resets it to 0.
int next_attribute;
// Appends attribute to the attribs table unless an equal string is already
// there. Once the table is full, further new values are dropped rather than
// written past its end.
// attribute is assumed to fit in MAX_STRING_SIZE bytes including the
// terminator (callers pass Field.text, which has that size).
void addAttribute(char* attribute)
{
    const int capacity = (int)(sizeof attribs / sizeof attribs[0]);

    for (int i = 0; i < next_attribute; i++) {
        if (strcmp(attribs[i], attribute) == 0) {
            return; // already recorded
        }
    }
    if (next_attribute >= capacity) {
        return; // table full
    }
    strcpy(attribs[next_attribute], attribute);
    next_attribute++;
}
void analyseClusters(FILE* rfp, bool* g)
{
int i;
int j;
int k;
bool attribute_found;
float min;
float max;
int count;
for (i=0; i<number_of_points; i++)
{
if (cluster_sizes[i] > 2)
{
fprintf(rfp, "Cluster %d has %d members\n", i, cluster_sizes[i]);
for (j=0; j<MAX_TOKENS; j++)
{
next_attribute = 0;
if (g[j])
{
switch (FieldType[j])
{
case TOKEN_UNUSED:
break;
case TOKEN_LINEAR_INTEGER:
case TOKEN_LINEAR_FLOAT:
max = MinFieldType[j];
min = MaxFieldType[j];
for (k=0; k<number_of_points; k++)
{
if (i == allocated_cluster[k])
{
if (max < Records[k][j].value) max = Records[k][j].value;
if (min > Records[k][j].value) min = Records[k][j].value;
}
}
fprintf(rfp, "feature = %d, min = %f, max = %f\n\n", j, min, max);
break;
case TOKEN_NOMINAL:
next_attribute = 0;
count = 0;
for (k=0; k<number_of_points; k++)
{
if (i == allocated_cluster[k])
{
addAttribute(Records[k][j].text);
}
}
fprintf(rfp, "feature = %d, ", j);
for (k=0; k<next_attribute; k++)
{
fprintf(rfp, "%s, ", attribs[k]);
}
fprintf(rfp, "\n\n");
break;
default:
break;
}
}
}
}
}
}
void reduceOutliers(bool* g, int merge_distance)
{
int i;
int j;
int cluster_i;
int cluster_j;
int nearest_cluster;
int cluster_distance;
int nearest_distance;
int count = 0;
int outliers = 0;
for (i=0; i<number_of_points; i++)
{
nearest_distance = 100;
cluster_i = allocated_cluster[i];
if (cluster_sizes[cluster_i] <= 3)
{
outliers++;
// Can we merge it to a nearby cluster?
for (j=0; j<number_of_points; j++)
{
if (i != j)
{
cluster_j = allocated_cluster[j];
if (cluster_sizes[cluster_j] > 3)
{
cluster_distance = distance(g, i, j);
//printf("cluster_distance = %d\n", cluster_distance);
if (cluster_distance < nearest_distance)
{
nearest_distance = cluster_distance;
nearest_cluster = cluster_j;
}
}
}
}
if (nearest_distance < merge_distance)
{
allocated_cluster[i] = nearest_cluster;
cluster_sizes[cluster_i]--;
cluster_sizes[nearest_cluster]++;
count++;
}
}
}
//printf("%d outliers merged = %d\n", outliers, count);
//getchar();
}
// Prints the first number_of_features genes of g as a row of "T " / "F ".
void displayGene(bool* g, int number_of_features)
{
    printf("Genes = ");
    for (int field = 0; field < number_of_features; field++) {
        printf(g[field] ? "T " : "F ");
    }
    printf("\n\n");
}
// Runs the whole pipeline on "<file>.txt" and writes "<file>_report.txt".
//
// file   - data set name without the ".txt" extension.
// ignore - index of a field to exclude from clustering (masked out in g);
//          values outside 0..MAX_TOKENS-1 mask nothing.
//
// Steps: classify the fields, read a random sample, run the genetic
// algorithm to pick the fields to cluster on, then - if the distance
// histogram of the best gene set shows a peak and a trough - reload the full
// data, cluster it, merge outliers and write the reports and per-cluster
// files. Does nothing if the report file cannot be opened.
void clusterFile(char* file, int ignore)
{
int number_of_records;
int number_of_fields;
// NOTE: these two locals shadow the globals of the same name.
int number_of_peaks;
int number_of_troughs;
int clustering_distance;
int i;
int best;
char infile [MAX_STRING_SIZE];
char rptfile[MAX_STRING_SIZE];
FILE* rfp;
// Restart cluster numbering and enable every field except the ignored one.
next_cluster = -1;
init_g();
if ((ignore >= 0) && (ignore < MAX_TOKENS))
{
g[ignore] = false;
}
// NOTE(review): sprintf is unbounded; file must be short enough for
// MAX_STRING_SIZE including the suffix.
sprintf(infile, "%s.txt", file);
sprintf(rptfile, "%s_report.txt", file);
if ( ( rfp = fopen( rptfile, "w" ) ) != NULL )
{
fprintf(rfp, "Clustering '%s'\n\n", infile);
// Pass 1-3 over the file: field types, numeric ranges, usable record count.
initFieldType();
setFieldType(infile, false);
setMinMax(infile, false);
total_number_of_records = readNumberOfRecords(infile, false);
// The genetic algorithm works on a random sample of the records.
number_of_records = readSampleData(infile, false);
// number_of_records = readData(infile, false);
number_of_points = number_of_records;
fprintf(rfp, "%d records read\n", number_of_records);
number_of_fields = printFieldType();
number_of_features = number_of_fields;
fprintf(rfp, "Each record contains %d fields\n", number_of_fields);
fprintf(rfp, "\n");
//getchar();
// Genetic algorithm: 50000 rounds of crossover, mutation and replacement.
initPopulation();
for (i=0; i<50000; i++)
{
crossover(select(), select());
mutation(number_of_fields);
replace();
}
printf("\n");
printf("--------------------------\n");
printf("\n");
best = getBest();
printf("\n");
printf("Best member of the population = %d\n", best);
printf("\n");
displayGene(population[best].genes, number_of_fields);
//getchar();
printf("\n");
printf("--------------------------\n");
printf("\n");
// Rebuild and save the histogram for the best gene set, then locate its
// peaks and troughs.
get_freq_100(population[best].genes);
get_freq_100_ave();
save_freq_100("freq100.csv");
save_freq_100_ave("freq100_ave.csv");
number_of_peaks = count_peaks();
printf("Peaks = %d\n", number_of_peaks);
for (i=0; i<MAX_NUMBER_OF_PEAKS; i++)
{
if (peaks[i] != 0) printf("%d (%d) ", peaks[i], heights[i]);
}
printf("\n");
printf("\n");
number_of_troughs = count_troughs();
printf("Troughs = %d\n", number_of_troughs);
for (i=0; i<MAX_NUMBER_OF_PEAKS; i++)
{
if (troughs[i] != 0) printf("%d (%d) ", troughs[i], depths[i]);
}
printf("\n\n");
if ((number_of_peaks >= 1) && (number_of_troughs >= 1))
{
// Clustering distance is half the first trough position, but at least 3.
clustering_distance = troughs[0] /2;
if (clustering_distance < 3) clustering_distance = 3;
printf("\n");
printf("Clustering has been found\n");
fprintf(rfp, "Clustering has been found\n");
printf("The clustering distance used to identify clusters = %d\n", clustering_distance);
fprintf(rfp, "The clustering distance used to identify clusters = %d\n", clustering_distance);
for (i=0; i<number_of_fields; i++)
{
if (population[best].genes[i])
{
printf("Clustering using field %d\n", i);
fprintf(rfp, "Clustering using field %d\n", i);
}
}
initAllocatedCluster();
initClusterSizes();
// Need to read in the real data!!!!
number_of_records = readData(infile, false);
number_of_points = number_of_records;
// Cluster the full data, merge outliers (merge distance 10), then report.
clusterData(population[best].genes, number_of_records, clustering_distance);
setClusterSizes();
reduceOutliers(population[best].genes, 10);
reportClusters(rfp);
analyseClusters(rfp, population[best].genes);
saveClusters(file, number_of_records);
printf("\n");
fprintf(rfp, "\n");
}
else
{
printf("\n");
fprintf(rfp, "\n");
printf("Clustering has NOT been found\n");
fprintf(rfp, "Clustering has NOT been found\n");
printf("\n");
}
fclose(rfp);
}
}
// Program entry point: seeds the random generator, clusters one data set and
// waits for a key press before exiting. argc and argv are not used.
// The commented-out calls are alternative data sets with the index of the
// field each run ignores.
int _tmain(int argc, _TCHAR* argv[])
{
// Constant seed; presumably makes runs repeatable - confirm against MTRand.
random.seed(9);
//clusterFile("Iris", 4);
//clusterFile("Iris_0", 4);
//clusterFile("Iris_3", 4);
//clusterFile("Breast Cancer Wisconsin", 10);
//clusterFile("Dermatology", 34);
//clusterFile("Dermatology_0", 34);
//clusterFile("Dermatology_5", 34);
//clusterFile("Dermatology_7", 34);
//clusterFile("Dermatology_13", 34);
//clusterFile("Seeds", 7);
//clusterFile("Seeds_0", 7);
//clusterFile("Seeds_2", 7);
//clusterFile("Seeds_3", 7);
clusterFile("Mushroom", 0);
//clusterFile("Hepatitis Domain", 0);
printf("\n");
printf("Press any Key to exit> ");
getchar();
printf("\n");
return 0;
}
/*
init_g();
//g[4] = false;
initFieldType();
//setFieldType("Iris.txt", false);
//setMinMax("Iris.txt", false);
//number_of_records = readData("Iris.txt", false);
setFieldType("Mushroom.txt", true);
setMinMax("Mushroom.txt", true);
number_of_records = readData("Mushroom.txt", true);
number_of_points = number_of_records;
number_of_fields = printFieldType();
initPopulation();
for (i=0; i<200; i++)
{
crossover(select(), select());
mutation(number_of_fields);
replace();
}
printf("\n");
printf("--------------------------\n");
printf("\n");
best = getBest();
printf("\n");
printf("Best member of the population = %d\n", best);
printf("\n");
printf("\n");
printf("--------------------------\n");
printf("\n");
get_freq_100(population[best].genes);
get_freq_100_ave();
printf("Peaks=%d\n", count_peaks());
for (i=0; i<MAX_NUMBER_OF_PEAKS; i++)
{
if (peaks[i] != 0) printf("%d (%d) ", peaks[i], heights[i]);
}
printf("\n");
number_of_peaks = count_peaks();
if (2 == number_of_peaks)
{
printf("\n");
//printf("Initial clustering distance = %f\n", (double)peaks[0] * max_distance() / (2.0 * 100.0));
}
number_of_troughs = count_troughs();
printf("Troughs=%d\n", number_of_troughs);
for (i=0; i<MAX_NUMBER_OF_PEAKS; i++)
{
if (troughs[i] != 0) printf("%d (%d) ", troughs[i], depths[i]);
}
printf("\n\n");
//save_freq_100("freq100.csv");
//save_freq_100_ave("freq100_ave.csv");
if ((number_of_peaks >= 1) && (number_of_troughs >= 1))
{
clustering_distance = troughs[0] / 2;
printf("\n");
printf("Clustering has been found\n");
printf("The clustering distance used to identify clusters = %d\n", clustering_distance);
for (i=0; i<number_of_fields; i++)
{
if (population[best].genes[i])
{
printf("Clustering using field %d\n", i);
}
}
initAllocatedCluster();
initClusterSizes();
clusterData(population[best].genes, number_of_records, clustering_distance);
reportClusters();
printf("\n");
}
else
{
printf("\n");
printf("Clustering has NOT been found\n");
printf("\n");
}
*/