Hi all, I have been trying to read a huge file of around 7 GB.
It looks like an impossible task right now.
The file looks like this:
CITY0_084989053<tab>hdewhiuewf
CITY1_000989090<tab>AACGTACGT
CITY1_000989090<tab>GTACGATAH
CITY2_643274032<tab>kdijadadsail
CITY3_004498906<tab>Adjbsajdada
CITY3_004498906<tab>Adjbsajdada
......
I expect output like the following, containing only the records whose left-side ID occurs more than once:
CITY1_000989090#1<tab>AACGTACGT
CITY1_000989090#2<tab>GTACGATAH
CITY3_004498906#1<tab>Adjbsajdada
CITY3_004498906#2<tab>Adjbsajdada
......
The steps I follow are:
1) First I use awk to extract the left-side IDs into a file, then run uniq on them to create a file of unique headers.
2) Then I read that file, compare each record's header against it, and assign #1 / #2 ... to the duplicates.
Here is my initial code:
#include<iostream>
#include<string.h>
#include<fstream>
#include<stdlib.h>
#include<set>
#include<map>
using namespace std;
// Parses the leading decimal integer of strConvert with atoi() and returns it.
// Text that does not begin with a number converts to 0.
inline int GetIntVal(string strConvert)
{
    return atoi(strConvert.c_str());
}
// Reads one complete line from fp into out, without its trailing '\n'.
// The line is assembled from fixed-size fgets() chunks, so its length is not
// limited by the chunk buffer. Returns false only when nothing at all could be
// read (end of file or read error); a final line that lacks a newline is still
// returned.
static bool ReadLine(FILE* fp, std::string& out)
{
    char chunk[4096];
    bool read_any = false;

    out.clear();
    while (fgets(chunk, static_cast<int>(sizeof(chunk)), fp) != NULL) {
        read_any = true;
        out += chunk;
        if (!out.empty() && out[out.size() - 1] == '\n') {
            out.erase(out.size() - 1);
            break;
        }
    }
    return read_any;
}

// Usage: <program> <id_file> <data_file>
//
//   id_file   : one ID per line; only these IDs are reported.
//   data_file : tab-separated records of the form "ID<tab>value".
//
// For every data_file record whose ID is listed in id_file, writes
//   ID/#N<tab>value
// to stdout, where N is the 1-based occurrence number of that ID so far.
// The data file is streamed line by line; only the IDs are held in memory.
//
// Returns 1 if an argument is missing or a file cannot be opened, 0 otherwise.
int main(int argc ,char* argv[])
{
    if (argc < 3) {
        std::cerr << "usage: " << (argc > 0 ? argv[0] : "program")
                  << " <id_file> <data_file>" << std::endl;
        return 1;
    }

    // ID -> number of matching records seen so far. One ordered-map lookup per
    // record replaces a scan over every stored ID.
    typedef std::map<std::string, int> OccurrenceMap;
    OccurrenceMap occurrences;

    std::string line;

    ////////////// Reading first file: the IDs of interest
    FILE* id_file = fopen(argv[1], "r");   // read-only: no write permission needed
    if (id_file == NULL) {
        std::cerr << "cannot open " << argv[1] << std::endl;
        return 1;
    }
    while (ReadLine(id_file, line)) {
        occurrences.insert(OccurrenceMap::value_type(line, 0));
    }
    fclose(id_file);

    ////////////// Reading second file: the records
    FILE* data_file = fopen(argv[2], "r");
    if (data_file == NULL) {
        std::cerr << "cannot open " << argv[2] << std::endl;
        return 1;
    }
    while (ReadLine(data_file, line)) {
        // Tokenise like strtok(line, "\t"): leading and repeated tabs are
        // skipped, the first token is the ID and the second is the value.
        const std::string::size_type id_begin = line.find_first_not_of('\t');
        if (id_begin == std::string::npos) {
            continue;   // blank line: nothing to match, and no stale ID is reused
        }
        const std::string::size_type id_end = line.find('\t', id_begin);
        const std::string id = line.substr(id_begin, id_end - id_begin);

        // Parsed fresh for each record so a one-field line prints an empty
        // value instead of the previous record's.
        std::string value;
        if (id_end != std::string::npos) {
            const std::string::size_type value_begin = line.find_first_not_of('\t', id_end);
            if (value_begin != std::string::npos) {
                const std::string::size_type value_end = line.find('\t', value_begin);
                value = line.substr(value_begin, value_end - value_begin);
            }
        }

        const OccurrenceMap::iterator hit = occurrences.find(id);
        if (hit != occurrences.end()) {
            // Numbering is per ID, so an ID seen once or three times does not
            // shift the numbers of later IDs. '\n' avoids a flush per record.
            std::cout << id << "/#" << ++hit->second << '\t' << value << '\n';
        }
    }
    fclose(data_file);

    return 0;
}
I have never used the seek function. Maybe it could solve the problem, but I have no idea how to use it in this scenario.
Thanks in advance!