Okay, I'm making a program that puts six-word sequences into a hash table. The hash is computed by taking the ASCII value of each letter of each word, multiplying it by a position-dependent weight, and adding the result to a running sum, like this:
num = num + int(firstName[nq])*(nq+1);
in a loop.
Now, I have a hash table the exact size of the number of total words.
In theory, every distinct six-word sequence should get a unique number and its own slot in the hash table, while identical six-word sequences should land in the same slot.
However, when I run my program, I get far too many collisions: it reports 643 matching six-word sequences between two files that do not share a single six-word sequence.
Can anyone POSSIBLY help? Here is my code:
/**
 * Append the name of every entry in a directory to `files`.
 *
 * Entries are appended in the order readdir() reports them and are not
 * filtered, so the special entries "." and ".." are included whenever the
 * platform reports them. Names are bare entry names, not full paths.
 *
 * @param dir    Path of the directory to list.
 * @param files  Output vector; entry names are appended to it.
 * @return 0 on success, or the errno value set by opendir() on failure
 *         (an error line is also printed to standard output in that case).
 */
int getdir (std::string dir, std::vector<std::string> &files)
{
    DIR *dp = opendir(dir.c_str());
    if (dp == NULL) {
        // Capture errno before doing any stream output: the I/O below is
        // allowed to modify errno, which would change the code we return.
        const int openError = errno;
        std::cout << "Error(" << openError << ") opening " << dir << std::endl;
        return openError;
    }

    struct dirent *dirp;
    while ((dirp = readdir(dp)) != NULL) {
        files.push_back(std::string(dirp->d_name));
    }

    closedir(dp);
    return 0;
}
int main(int argc, char *argv[])
{
List_3358<int> *newList=new List_3358<int>[33971];
long long int num;
num = 0;
int sizey;
int county = 0;
int abc=0;
int mastercount=0;
string rand; // used to convert argv into a string
for(int i = 1; i < argc-1; i++)
{
rand = rand+argv[i];
}
char tempo = *argv[argc-1]; // this and the line below stores the last parameter(the number) into an int
int n = atoi(&tempo);
string test; // used for running the program
string dir = string(rand);
vector<string> files = vector<string>(); // a vector created
getdir(dir,files); //runs the getdir function to get the file names
for (unsigned int i = 0;i < files.size();i++) {
cout << files[i] << endl;
} //prints the file names (good for testing)
ifstream fin[files.size()];//makes an array of fins
ofstream fout;
fout.open("test.txt");//opens the text output file
const int SIZE = n;
string str1[SIZE];
string sum;
//master loop, runs the main loop until there are no more files to read
for(int z =2; z<files.size();z++){
test=rand;
test = test+files[z];
cout << endl << test.data() << endl;
fin[z].open(test.data());
char firstName[100];
num = 0;
//sum = "";
for(int i=0; i<SIZE; i++) //gets the original SIZE words into the string array.
{
fin[z] >> str1[i];
mastercount++;
//sum = sum + str1[i];
strcpy(firstName, str1[i].c_str());
sizey=strlen(firstName);
//cout << endl << " " << firstName << " ";
for(int q = 0; q<sizey; q++)
{
num = num + (int(firstName[q])*(q+1));
//cout << num << endl;
}
}
newList[num%33971].insert(z);
fout << " " << num << " ";
fout << endl << "| ";
for(int p=0; p<SIZE; p++)
fout << str1[p] << " ";
fout << " |";
while(!fin[z].eof()) // this loop gets six words, one word at a time, until the file is empty.
{
num = 0;
for(int q = 0; q<SIZE-1; q++)
{
str1[q]=str1[q+1];
}
fin[z] >> str1[SIZE-1];
mastercount ++;
for(int np=0; np<SIZE; np++) //gets the original SIZE words into the string array.
{
strcpy(firstName, str1[np].c_str());
sizey=strlen(firstName);
//cout << sizey << " " << firstName;
//cout << endl << firstName << endl;
for(int nq = 0; nq<sizey; nq++)
{
num = num + int(firstName[nq])*(nq+1);
//cout << num << endl;
}
}
//cout << endl;
newList[num%33971].insert(z);
fout << " " << num << " ";
fout << endl << "| ";
for(int r=0; r<SIZE; r++)
fout << str1[r] << " ";
fout << " |";
}
fout << endl << endl << endl << endl << endl << endl << endl << endl << endl << endl << endl << endl << endl << endl << endl;
fin[z].close();
}
fout.close();
cout << 33946%num << endl;
cout << sum << endl;
newList[13699%33971].getCurrent();
for (int rh=2;rh<files.size(); rh++){
for (int rq=rh;rq<files.size(); rq++){ county =0;
for(int x = 0; x < 33971; x++)
{
if(!newList[x].isEmpty())
{
if (newList[x].testy(rh,rq))
{
county++;
}
}
}
if (county >= 300)
cout << "Number of collisions between file " << rh << " and " << rq << ": " << county << endl;
}
}
cout << " " << mastercount;
system("PAUSE");
return 0;
}