I am writing a program to divide my entire database (which comprises of words(A-Z) and phonemic definitions of the corresponding word on the same line) into small text files that become manageable if they are to be read by an 8-bit microcontrolling unit. The database is sorted.
Right now at this stage I am only counting the characters in the smaller sub-divisions of the database. The strategy that I have used to divide my database is first I read the first character of the database and identify the character read after that I read the second character and group them or classify them into one array entry i.e. my array contains the number of characters in AA, AB,AC....BA,BB,BC........ZA,ZY,ZZ, a total of 676 values (26*26).
The problem I am facing is that the output is just not right and I have verified it using a Hex editor.
I am attaching the a ver small sample of the database. The entire database can be found here: https://cmusphinx.svn.sourceforge.net/svnroot/cmusphinx/trunk/cmudict/
#include<stdio.h>
#include<stdlib.h>
int main(){
FILE *fp;
fp=fopen("database2.txt","r"); //Error checking
if(fp==NULL)
{printf("Error opening file!");
exit(1);
}
int arr[676], ch='A', ch1='A', count=0, i=0, j=0, k=0, l=0; /* array of 26*26 entries for AA..ZZ, ch and ch1 for
reading the first 2 characters of the file,
count variable to count
the characters and other variable that serve as indices
for looping.*/
char a='A', a1='A'; // 2 char variables for comparing the first 2 characters of
// any word
for(i;i<676;i++) //initializing array elements to 0...
arr[i]=0;
i=0;
while((ch=fgetc(fp))!=EOF&&a<='Z') /*read first character & check for EOF,
loop until a is within the A-Z boundary
*/
{
if(ch==a) //is 1st character=a?
{
ch1=fgetc(fp); //read 2nd character if 1st comparison is passed.
if(ch1==a1)
{
count+=2; //increment count to 2 because we have already read 2 chars
while(ch1!='\n'&&ch1!=EOF) //increment until new line is not reached, store in array
{ch1=fgetc(fp);
count++;
arr[i]=count;
}
}
else //ch1!=a1
{
if(ch1>='A'&&ch1<='Z') //enter block only if ch1 is a valid alphabet
{
if(ch1>a1) // ch1 is greater than a1 because the database is sorted
{
for(j=0;j<(ch1-a1);j++) //ch1 is subtracted from a1 to
i++; //increment the index to store in appropriate position.
count=0;
count+=2; //make count=0 as we have read a word belonging to a new entry
a1=ch1; //set the variable a1 such that on the next iteration control passes to the if(ch1==a1) block
while(ch1!='\n'&&ch1!=EOF) //as we have read 2 chars of a new word, count upto /n and store
{ch1=fgetc(fp);
count++;
arr[i]=count;
}
}
}
else //if in the database we encounter A..Z and some special character then we skip eg A'.. or A. etc
{
count=0;
while(ch1!='\n'&&ch1!=EOF)
ch1=fgetc(fp);
}
}
}
else //if the 1st character of the word is not equal to the current variable a, then we increment a
{ // database contains entries ranging from A to Z, no alphabet is missed out,
// hence we can increment a to point to next character
++a;
count=0; //initialize count to 0 as we are going to fill a new entry in array.
ch1=fgetc(fp); //get 2nd character of the word
if(ch1>='A'&&ch1<='Z') //2nd char should be between A-Z.//
{
if(ch1>a1||ch1==a1) // the 2nd char is greater than the current variable a1 or ch==a1
{for(k=0;k<(26+(ch1-a1));k++) //wrap around! eg if previous 2 chars were AB... and the current chars are BC... or BB...
i++; //then increment the pointer to the appropriate position
a1=ch1;
count+=2;
while(ch1!='\n'&&ch1!=EOF) //read the first 2 chars, read the entire word...
{ch1=fgetc(fp);
count++;
arr[i]=count;
}
}
else { //if 2nd char is less than variable a1 eg. previous words' starting letters are AY
for(l=0;l<(('Z'-a1)+(ch1-'A')+1);l++) //and current words' starting letters are BC
i++;
a1=ch1;
count+=2;
while(ch1!='\n'&&ch1!=EOF) //read the first 2 chars, read the entire word...
{ch1=fgetc(fp);
count++;
arr[i]=count;
}
}
}
else
{
while(ch1!='\n'&&ch1!=EOF) //valid 1st char and invalid 2nd char, skip...
ch1=fgetc(fp);
}
}
}
for(j=0;j<676;j++) //print
{
printf("%d ",arr[j]);
if(j%10==0)
printf("\n");
}
fclose(fp);
return 0;
}