Hello All,
I am here with not really a problem but a bit of guidance in getting along in my path. Here I go with my task, I am trying to align a set of small sequences(may be around 5000 in number and less than 27 char in length) for example, ATGCATGCATGCATGCAAACCCCGG this may be the length of the sequences. And I have to align this set of sequences to a large sequence ( which may be 4KB-10kb in size). And preferably these two sequences are in fasta format.
example,
>exempl.id.1
ATGCATGCATGCATGCAAACCCCGG
>exemple.id.2
ATGCATGCATGCATGCAAACCCC like this.
I have attached the code below
#!/usr/bin/perl -w
use strict;
use Bio::SeqIO;
my $seqio_srna = Bio::SeqIO->new(- file=>'/home/jayakuma/script/graphics/model.fasta',
-format=>'fasta') or die "cannot open file\n";
my $seqio_large = Bio::SeqIO->new(-file => '/home/jayakuma/script/graphics/TCV-jagger.fna',
-format =>'fasta') or die " cannot opne the file\n";
my %genome;
my %srna;
while (my $large_seq = $seqio_large->next_seq()){
my $id = $large_seq->display_id;
my $seq = $large_seq->seq;
$genome{$id} = $seq;
}
while (my $seq_srna = $seqio_srna->next_seq()){
my $display_id = $seq_srna->display_id;
my $seq = $seq_srna->seq;
$srna{$display_id} = $seq;
}
foreach my $lar_seq( keys %genome){
my $genome_seq= $genome{$lar_seq} ;
foreach my $smal_seq(keys %srna){
my $srna_seq = $srna{$smal_seq};
print $srna_seq, "\n";
if ($genome_seq =~ $srna_seq){
my $align = lc$1;
$genome_seq =~ s/$1/$align/;
print $genome_seq, "\n";
}
}
}
In this I am just trying to get the exact matches. But there is some thing wrong and this results in producing some unintialised values in the line 31 which is :$genome_seq =~ s/$1/$align/;.
And I need them to be like
say ,
ATGCATGCATGCATGCAAACCCCGG
ATGC_T___A
Also I tried doing it with blast
#!/usr/bin/perl -w
use strict;
use Bio::SeqIO;
use Bio::Tools::Run::StandAloneBlast;
my %srna;
my %genome;
my $seqio_srna = Bio::SeqIO->new(-file=>'/home/jayakuma/script/graphics/model2.fasta',
-format=>'fasta') or die "cannot open file\n";
my $seqio_large = Bio::SeqIO->new(-file => '/home/jayakuma/script/graphics/TCV-jagger.fna',
-format =>'fasta') or die " cannot opne the file\n";
#while (my $large_seq = $seqio_large->next_seq()){
# my $id = $large_seq->display_id;
#my $seq = $large_seq->seq;
#$genome{$id} = $seq;
#}
while (my $seq_srna = $seqio_srna->next_seq()){
my $display_id = $seq_srna->display_id;
my $seq = $seq_srna->seq;
$srna{$display_id} = $seq;
}
foreach my $seq_id (keys %srna){
my $srnas = $srna{$seq_id};
#print "srnas: $srnas\n";
foreach($srnas){
my $blast = Bio::Tools::Run::StandAloneBlast->new(program=>'blastn',database=>'/home/jayakuma/script/graphics/TCV-jagger.fna');
my $input = Bio::Seq->new(-seq=> $srnas);
print $input->seq, "\n";
my $blast_report = $blast->blastall($input);
while (my $result = $blast_report->next_result()){
my $query_length = $result->query_length();
while (my $hit = $result->next_hit()){
my $id = $hit->accession();
while (my $hsp = $hit->next_hsp()){
if($hsp->frac_identical ==1 && $hsp ->length ==$query_length){
print "$srnas\t$id\n";
}
}
}
}
}
}
I am getting an error message with this as well:
CAGCGATGGGGATCAAGCTC
Use of uninitialized value in pattern match (m//) at /usr/share/perl5/Bio/SeqIO/fasta.pm line 193, <GEN0> line 2.
Use of uninitialized value in print at /usr/share/perl5/Bio/Root/IO.pm line 407, <GEN0> line 2.
-------------------- WARNING ---------------------
MSG: cannot find path to blastall
---------------------------------------------------
Can't call method "next_result" on an undefined value at aligning.pl line 35, <GEN0> line 2.
I have reduced the file size in to just a single sequence thinking that there may be some thing wrong when parsing the id in fasta files. But every time I change the number of sequences it results in the same error message but reporting with different lines each time
any help with any of these code will be really helpful,
Thanks for all your time