help with b-tree using two files and hash/key lookoop

Question

bio-grad 0 Junior Poster in Training

12 Years Ago

Hi,

So I am modifying some code from discussion thread: http://www.daniweb.com/software-development/perl/threads/323871

I am still very much a newbie, but am trying to get a better handle on hashes - so if I've messed mine up please tell me ;).

I am attaching 3 files. One is an image of the scenario I am evaluating and the other two are the data files to be use by this code.

While looking at the image - we have two sets of data -
1) gene id [c0], scaffold location [c1], start [c2], stop [c3]and other info [c4]
2) hairpin ids [h0], scaffold locations [h1], start [h2], stop [h3],other info [h4], and sequence read [h15].

Note: c2 may be > c3 or < c3; likewise h2 may be > h3 or < h3 as these sequences have directionality and can read right to left or left to right.

I want to:
find every occurrence where h0 eq c0 (ie. scaffold1000 = scaffold1000) with the hairpin scaffold being the $key

when the hairpin scaffold $key matches any gene scaffold location [c1], I need it to test for conditions (ie. if h2<=c2 && h3>c2 etc.). However, there will be numerous genes with a matching scaffold for a single hairpin and I only want the output for the gene located closest to the hairpin start and stop, hence the need for a b-tree.

I'm thinking I should run the b-tree to locate the closes gene before testing the conditions, but I'm not sure. If there is another way to run this I'm open to suggestions.

Thanks everyone!

#!/usr/bin/perl;
#comparepremiRNAtocoding-pulloverlap3.pl;
use strict;
use warnings;
use FileHandle;
use Data::Dumper;


open(H,"<$ARGV[0]")||die"open h failed";
open(C,"<$ARGV[1]")||die"open c failed";

my %h_hash=();
while(<H>)  # first load h into hash of arrays
{
  chomp;
  my $key=(split/\t/)[1];   # split by tab and make column 2 (index 1) the key (i.e. the scaffolds);
  push @{$h_hash{$key}},[split(/\t/)];
}
close H;
while(<C>) # then sequentially read this file and do the compares on the fly
{
  chomp;
  my @c=split/\t/;    #  split by tab
  foreach my $i ( 0 .. $#{ $h_hash{$c[1]} } ) # for each array of this type (scaffold)
	{  
	   if (  (($h_hash{$c[1]}[$i][2] <= $c[2] #outside hits
		&& $h_hash{$c[1]}[$i][3] <= $c[2] 
		&& $h_hash{$c[1]}[$i][2] <= $c[3] 
		&& $h_hash{$c[1]}[$i][3] <= $c[3])) 
	      || (($h_hash{$c[1]}[$i][2] >= $c[2]
		&& $h_hash{$c[1]}[$i][3] >= $c[2]  
		&& $h_hash{$c[1]}[$i][2] >= $c[3] 
		&& $h_hash{$c[1]}[$i][3] >= $c[3]))  )
    		{ 
# 		need a B-tree for every $key[1] (aka scaffold) so that for every hairpin array I can which coding "region" is nearest- calculate distance between $h_hash{$c[1]}[$i][2] and $h_hash{$c[1]}[$i][3] to $c[2] and $c[3] foreach $i ( 0 .. $#{ $h_hash{$c[1]} } ) (i.e. for each $c[2] and $c[3] where $c[1] eq $key[1]) 
# such as: 
#$h_hash{$c[1]}[$i][2] - $c[2] = |$a|
#$h_hash{$c[1]}[$i][2] - $c[3] = |$b|
#$h_hash{$c[1]}[$i][3] - $c[2] = |$d|
#$h_hash{$c[1]}[$i][3] - $c[3] = |$e|
# for all $c[2] and $c[3] where $key[1] == $c[1]
# sort ($a <==> $b) and select the smallest value - return scalar to be used in print output
				
			
		   {print "$h_hash{$c[1]}[$i][0]."\t".$h_hast{$c[1]}[$i][4]."\t outside"."\t".$c[0]."\t".$c[1]."\t".$h_hash{$c[1]}[$i][2]."\t".$h_hash{$c[1]}[$i][3]."\t".$h_hash{$c[1]}[$i][15]."\n"";}
		}

	 else if      (  (($h_hash{$c[1]}[$i][2] <= $c[2] #inside hits:
			&& $h_hash{$c[1]}[$i][3] <= $c[2] 
			&& $h_hash{$c[1]}[$i][2] >= $c[3] 
			&& $h_hash{$c[1]}[$i][3] >= $c[3])) 
		      || (($h_hash{$c[1]}[$i][2] >= $c[2]
			&& $h_hash{$c[1]}[$i][3] >= $c[2]  
			&& $h_hash{$c[1]}[$i][2] <= $c[3] 
			&& $h_hash{$c[1]}[$i][3] <= $c[3]))  )
               	{ print "$h_hash{$c[1]}[$i][0]."\t".$h_hast{$c[1]}[$i][4]."\t inside"."\t".$c[0]."\t".$c[1]."\t".$h_hash{$c[1]}[$i][2]."\t".$h_hash{$c[1]}[$i][3]."\t".$h_hash{$c[1]}[$i][15]."\n""}

	else  (
		{ print "$h_hash{$c[1]}[$i][0]."\t".$h_hast{$c[1]}[$i][4]."\t overlap"."\t".$c[0]."\t".$c[1]."\t".$h_hash{$c[1]}[$i][2]."\t".$h_hash{$c[1]}[$i][3]."\t".$h_hash{$c[1]}[$i][15]."\n"";}
  }
}
close C;
exit 0;

b-tree hashes perl

dani-sampleGenedata.txt (0.94 KB)

Code1	scaffold3	5	20	gene
Code1.t	scaffold3	5	20	transcript
transcript_id"Code1";	scaffold3	5	5	tss
transcript_id"Code1";	scaffold3	5	7	5'-UTR
transcript_id"Code1";	scaffold3	6	7	start_codon
transcript_id"Code1";	scaffold3	8	19	CDS
transcript_id"Code1";	scaffold3	20	28	internal
transcript_id"Code1";	scaffold3	29	31	intron
transcript_id"Code1";	scaffold3	32	35	CDS
transcript_id"Code1";	scaffold3	38	40	3'-UTR
Code2	scaffold3	65	45	gene
Code2.t	scaffold3	65	45	transcript
transcript_id"Code2";	scaffold3	65	65	tss
transcript_id"Code2";	scaffold3	65	63	5'-UTR
transcript_id"Code2";	scaffold3	64	63	start_codon
transcript_id"Code1";	scaffold3	62	60	internal
transcript_id"Code1";	scaffold3	59	54	intron
transcript_id"Code1";	scaffold3	52	50	CDS
transcript_id"Code1";	scaffold3	48	45	3'-UTR
Code3 	scaffold7	34	80	gene
Code3.t	scaffold7	34	80	transcript
transcript_id"Code3";	scaffold7	34	34	tss
JG289050	scaffold14	4188	3979	
JG289050	scaffold8	207017	206789

dani-sampleHairpindata.txt (1.06 KB)

A1	scaffold3	1	7	csi-60											GCUCCCAAAUUUACGAAACGUUCCGGGG
A2	scaffold3	7	1	csi-60											GCUCCCAAAUUUACGAAACGUUCCGGGG
B1	scaffold3	10	15	csi-60											GCUCCCAAAUUUACGAAACGUUCCGGGG
B2	scaffold3	15	10	csi-60											GCUCCCAAAUUUACGAAACGUUCCGGGG
C1	scaffold3	18	25	csi-60											GCUCCCAAAUUUACGAAACGUUCCGGGG
C2	scaffold3	25	18	csi-60											GCUCCCAAAUUUACGAAACGUUCCGGGG
D1	scaffold3	30	35												GCUCCCAAAUUUACGAAACGUUCCGGGG
D2	scaffold3	35	30												GCUCCCAAAUUUACGAAACGUUCCGGGG
E1	scaffold3	40	50	csi-60											GCUCCCAAAUUUACGAAACGUUCCGGGG
E2	scaffold3	50	40	csi-60											GCUCCCAAAUUUACGAAACGUUCCGGGG
F1	scaffold3	52	57	csi-60											GCUCCCAAAUUUACGAAACGUUCCGGGG
F2	scaffold3	57	52	csi-60											GCUCCCAAAUUUACGAAACGUUCCGGGG
G1	scaffold3	60	75	csi-60											GCUCCCAAAUUUACGAAACGUUCCGGGG
G2	scaffold3	75	60	csi-60											GCUCCCAAAUUUACGAAACGUUCCGGGG
H1	scaffold3	80	85	csi-60											GCUCCCAAAUUUACGAAACGUUCCGGGG
H2	scaffold3	85	80	csi-60											GCUCCCAAAUUUACGAAACGUUCCGGGG
I1	scaffold4	11	52	csi-60											GCUCCCAAAUUUACGAAACGUUCCGGGG

dani-sampleimage.txt (0.32 KB)

-->Code1-->					<-Code2<-
----------------------------------------------------------------------------
   ___A____ ___B_ __C_ 	 	____D____	___E___  __F___ __G__   _H__

For each hairpin (A .. H)
which coding region (aka gene) are they closest too?
do they overlap, lie outside, or lie inside of this coding region (aka gene)?

2 Contributors
42 Replies
593 Views
4 Weeks Discussion Span
Latest Post 12 Years Ago Latest Post by bio-grad

d5e5 109 Master Poster

12 Years Ago

Does the following work for you?

#!/usr/bin/perl;
#comparepremiRNAtocoding-pulloverlap3.pl;
use strict;
use warnings;
use List::Util qw[min max];
use Data::Dumper;

#You can delete the following line. My testing setup works better without command-line arguments
@ARGV = qw(dani-sampleHairpindata.txt dani-sampleGenedata.txt) if @ARGV == 0;

my $hairpin_filename = $ARGV[0];
my $gene_filename = $ARGV[1];

my @genedata = read_genedata();
chomp @genedata;

my %h_hash = ();
read_hairpindata();

#print Dumper(\%h_hash);#Dump hairpin data structure
#We can access the data for each hairpin id in each scaffold location
#by looping through %h_hash
my $header = 'Hairpin_ID, hairpin other, outside/inside/overlap,'
               . 'gene ID, gene other, distance from hairpin, scaffold,'
               . 'hairpin start, hairpin stop, hairpin sequence';#Heading line
               
print $header, "\n";
        
foreach my $s(keys %h_hash){
    foreach my $h(keys %{$h_hash{$s}}){
        #dereference the genedata array
        my ($g_id, $g_scaffold, $g_start, $g_stop, $g_other, $dist) = @{$h_hash{$s}{$h}{nearest_gene_data}};
        
        #Dereference array reference and join array elements
        my $hairpin_other = join '', @{$h_hash{$s}{$h}{other}};
        my $hairpin_start = $h_hash{$s}{$h}{start};
        my $hairpin_stop = $h_hash{$s}{$h}{stop};
        my $hairpin_seq = $h_hash{$s}{$h}{sequence_read};
        
        if (defined $g_id){
            my $oio = calc_rel_pos($hairpin_start, $hairpin_stop, $g_start, $g_stop);
            print "$h,$hairpin_other,$oio,$g_id,$g_other,$dist,$s,$hairpin_start,$hairpin_stop,$hairpin_seq\n";
        }
        else{
            print "$h,$hairpin_other,,,,,$s,$hairpin_start,$hairpin_stop,$hairpin_seq\n";
        }
    }
}

sub read_genedata{   
    open(my $fh, '<', $gene_filename)||die "open $gene_filename failed: $!";
    
    #Read entire genedata file into an array
    return <$fh>;
}

sub read_hairpindata{
    open(my $fh, '<', $hairpin_filename)||die "open $hairpin_filename failed: $!";
    
    #Read entire hairpindata file into a hash
    while(<$fh>){
        chomp;
        my @cols = split/\t/;#split by tab
        my $scaffold_loc = $cols[1];#make column 2 (index 1) the scaffold location key (i.e. the scaffolds);
        my $hairpin_id = $cols[0];#hairpin_id also needs to be a key so you can look up data later
        $h_hash{$scaffold_loc}{$hairpin_id}{start} = $cols[2];
        $h_hash{$scaffold_loc}{$hairpin_id}{stop} = $cols[3];
        $h_hash{$scaffold_loc}{$hairpin_id}{other} = [@cols[4..14]];
        $h_hash{$scaffold_loc}{$hairpin_id}{sequence_read} = $cols[15];
        $h_hash{$scaffold_loc}{$hairpin_id}{nearest_gene_data} = nearest_gene($scaffold_loc, $hairpin_id);
    }
}

sub nearest_gene{
    my ($s, $h) = @_; #scaffold location and hairpin_id received as arguments
    
    my $save_min = 999999;#Big number for comparing distances
    my @gene_rec;
    
    foreach (@genedata){
        chomp;
        my @c = split;
        my ($g_id, $g_scaffold, $g_start, $g_stop, $g_other) = split /\s+/, $_, 6;
        #Added the following statement to test only genes with matching scaffold loc
        next unless ($c[1] eq $s) and ($c[4] eq 'gene');#Test only when other info contains 'gene'
        
        my $min = min( abs($h_hash{$s}{$h}{start} - $g_start),
                    abs($h_hash{$s}{$h}{stop} - $g_start),
                    abs($h_hash{$s}{$h}{start} - $g_stop),
                    abs($h_hash{$s}{$h}{stop} - $g_stop) );
        if ($min < $save_min){
            $save_min = $min;
            @gene_rec = ($g_id, $g_scaffold, $g_start, $g_stop, $g_other,$save_min);
        }
    }
    return \@gene_rec;
}

sub calc_rel_pos{
    my ($h_start, $h_stop, $g_start, $g_stop) = @_;
    
    if (is_between($h_start, $g_start, $g_stop)
        && is_between($h_stop, $g_start, $g_stop)){
        return 'inside';
    }
    
    if (!is_between($h_start, $g_start, $g_stop)
    && is_between($h_stop, $g_start, $g_stop)
    ||
    is_between($h_start, $g_start, $g_stop)
    && !is_between($h_stop, $g_start, $g_stop)){
    return 'overlap';
    }
    
    if (!is_between($h_start, $g_start, $g_stop)
    && !is_between($h_stop, $g_start, $g_stop)){
        return 'outside';
    }
}

sub is_between{
    my ($x, $lower_limit, $upper_limit) = @_;
    return ( $x >= $lower_limit && $x <= $upper_limit );
}

Output:

Hairpin_ID, hairpin other, outside/inside/overlap,gene ID, gene other, distance from hairpin, scaffold,hairpin start, hairpin stop, hairpin sequence
I1,csi-60,,,,,scaffold4,11,52,GCUCCCAAAUUUACGAAACGUUCCGGGG
G1,csi-60,outside,Code2,gene,5,scaffold3,60,75,GCUCCCAAAUUUACGAAACGUUCCGGGG
D2,,outside,Code1,gene,10,scaffold3,35,30,GCUCCCAAAUUUACGAAACGUUCCGGGG
A1,csi-60,overlap,Code1,gene,2,scaffold3,1,7,GCUCCCAAAUUUACGAAACGUUCCGGGG
A2,csi-60,overlap,Code1,gene,2,scaffold3,7,1,GCUCCCAAAUUUACGAAACGUUCCGGGG
E1,csi-60,outside,Code2,gene,5,scaffold3,40,50,GCUCCCAAAUUUACGAAACGUUCCGGGG
C1,csi-60,overlap,Code1,gene,2,scaffold3,18,25,GCUCCCAAAUUUACGAAACGUUCCGGGG
G2,csi-60,outside,Code2,gene,5,scaffold3,75,60,GCUCCCAAAUUUACGAAACGUUCCGGGG
F2,csi-60,outside,Code2,gene,7,scaffold3,57,52,GCUCCCAAAUUUACGAAACGUUCCGGGG
B2,csi-60,inside,Code1,gene,5,scaffold3,15,10,GCUCCCAAAUUUACGAAACGUUCCGGGG
C2,csi-60,overlap,Code1,gene,2,scaffold3,25,18,GCUCCCAAAUUUACGAAACGUUCCGGGG
B1,csi-60,inside,Code1,gene,5,scaffold3,10,15,GCUCCCAAAUUUACGAAACGUUCCGGGG
H1,csi-60,outside,Code2,gene,15,scaffold3,80,85,GCUCCCAAAUUUACGAAACGUUCCGGGG
H2,csi-60,outside,Code2,gene,15,scaffold3,85,80,GCUCCCAAAUUUACGAAACGUUCCGGGG
D1,,outside,Code1,gene,10,scaffold3,30,35,GCUCCCAAAUUUACGAAACGUUCCGGGG
E2,csi-60,outside,Code2,gene,5,scaffold3,50,40,GCUCCCAAAUUUACGAAACGUUCCGGGG
F1,csi-60,outside,Code2,gene,7,scaffold3,52,57,GCUCCCAAAUUUACGAAACGUUCCGGGG

Reply to this topic

Be a part of the DaniWeb community

We're a friendly, industry-focused community of developers, IT pros, digital marketers, and technology enthusiasts meeting, networking, learning, and sharing knowledge.

bio-grad 0 Junior Poster in Training · Answer 1 · 2011-08-05T01:11:19+00:00

perhaps, rather than a b-tree, someone could help me put together a method of finding the "gene" with the minimum distance to the "hairpin" by doing something like

my @dist = ( abs($h_hash{$c[1]}[$i][2] - $c[2]), abs($h_hash{$c[1]}[$i][3] - $c[2]), abs($h_hash{$c[1]}[$i][2] - $c[3]), abs($h_hash{$c[1]}[$i][3] - $c[3]) );

@dist = sort {$a <==> $b} @dist;
$min = $dist[0]; # this will find the distance to a single gene and would need to be done again for every array with a matching key (scaffold) and then I would want to find the min of each of these.

I just don't know how to reference back into the hash once the entry with the minimum distance from the hairpin is determined.

d5e5 109 Master Poster · Answer 2 · 2011-08-05T02:41:27+00:00

perhaps, rather than a b-tree, someone could help me put together a method of finding the "gene" with the minimum distance to the "hairpin" by doing something like
my @dist = ( abs($h_hash{$c[1]}[$i][2] - $c[2]), abs($h_hash{$c[1]}[$i][3] - $c[2]), abs($h_hash{$c[1]}[$i][2] - $c[3]), abs($h_hash{$c[1]}[$i][3] - $c[3]) );
@dist = sort {$a <==> $b} @dist;
$min = $dist[0]; # this will find the distance to a single gene and would need to be done again for every array with a matching key (scaffold) and then I would want to find the min of each of these.
I just don't know how to reference back into the hash once the entry with the minimum distance from the hairpin is determined.

Due to my unfamiliarity with the b-tree concept, I would favour your second idea for finding the minimum distance. I don't have any more computer time today but will try to find a way tomorrow.

What I might suggest (but haven't worked out yet) is using the Tie:File module to treat the Genedata file as an array that you can loop through repeatedly. As you find the nearest gene for each hairpin you could save the index of the Genedata array in the hairpin hash, or in another hash that links hairpin with nearest gene. I think that's promising but won't know if it works until I try it tomorrow.

bio-grad 0 Junior Poster in Training · Answer 3 · 2011-08-05T09:19:42+00:00

Hi d5e5!

Thanks for checking my post. Your suggestions sounds promising. I am trying to work on it with another coworker tomorrow and will post if we come up with something. How does the rest of the code look? I believe this was originally some of your code from another project you helped me with last year. I've modified it a good bit since then, and I really appreciate your help once again.

d5e5 109 Master Poster · Answer 4 · 2011-08-05T21:18:49+00:00

One quick question and then I hope to have some time this afternoon: I see your program loads a module called FileHandle but doesn't seem to do anything with any of that module's properties or methods. I'm not familiar with the FileHandle module. Is it an important part of your solution, or can I ignore it for now?

Avoid using single-letter bare words such as 'H' and 'C' when opening files. It may not have caused you a problem yet but it seems like fingernails on chalkboard to Perl programmers.

# Bareword filehandles such as SARAN are package globals. Use lexical filehandles.
# Prefer the three-argument form of open, especially if the filename is not hardcoded.
# Include the filename in the error message.

~Comment by Sinan Ünür on StackOverflow

The data structure in your hash looks unnecessarily complex but I'll have to experiment with that this afternoon before recommending a change. In principle, with the right data structures you can refer to any hairpin scaffold data by its key. I'll post more this afternoon.

bio-grad 0 Junior Poster in Training · Answer 5 · 2011-08-05T21:42:08+00:00

As I mentioned before, this is some old code someone helped me with. I believe you can ignore FileHandle for now and thanks for the fyi on the single-letter words when opening files... btw, can you tell me why we have call a file by two things (ie. H, $hairpin)?

d5e5 109 Master Poster · Answer 6 · 2011-08-06T00:39:50+00:00

As I mentioned before, this is some old code someone helped me with. I believe you can ignore FileHandle for now and thanks for the fyi on the single-letter words when opening files... btw, can you tell me why we have call a file by two things (ie. H, $hairpin)?

I'm not sure I understand the question but the open statement associates a file handle with a file name. The file name consists of the path and name of the file. The file handle represents a reference to some location in memory for internal use by perl. I'm not a computer scientist so that may not be exactly correct.

The following is obviously not complete, but I modified the structure of the hairpin hash so that the hairpin id can serve as a key within the scaffold location hash. The goal is to be able to store and look up the saved data for each hairpin in the scaffold. Let me know if this structure seems better or worse than what you had.

#!/usr/bin/perl;
#comparepremiRNAtocoding-pulloverlap3.pl;
use strict;
use warnings;
use Data::Dumper;

#You can delete the following line. My testing setup works better without command-line arguments
@ARGV = qw(dani-sampleHairpindata.txt dani-sampleGenedata.txt) if @ARGV == 0;

my $hairpin_filename = $ARGV[0];
my $gene_filename = $ARGV[1];

#open a file for reading and associate it with a lexical filehandle
open(my $fh, '<', $hairpin_filename)||die "open $hairpin_filename failed: $!";

my %h_hash=();
while(<$fh>)  # first load h into hash of arrays
{
  chomp;
  my @cols = split/\t/;#split by tab
  my $scaffold_loc = $cols[1];#make column 2 (index 1) the scaffold location key (i.e. the scaffolds);
  my $hairpin_id = $cols[0];#hairpin_id also needs to be a key so you can look up data later
  $h_hash{$scaffold_loc}{$hairpin_id} = [@cols];
}
close $fh;
print Dumper(\%h_hash);#Dump hairpin data structure

#Now if we want to lookup the data for, for example hairpin E2 in scaffold3
#we can do it as follows:
print join ', ', @{$h_hash{scaffold3}{E2}}, "\n";

#Open another file. We *could* associate it with a different filehandle
#but since $fh has been closed, we can use it again.
open($fh, '<', $gene_filename)||die "open $gene_filename failed: $!";

while(<$fh>){#Read genedata file line by line
    #calculate distance of gene to each hairpin
}
close $fh;

bio-grad 0 Junior Poster in Training · Answer 7 · 2011-08-06T02:13:55+00:00

I may need to see more of the code to see where you are going with this, but so far I'm confused as to why the hairpin id becomes a new hash key rather than a gene id as the idea is to compare the distances between a series of gene start and stops on a scaffold to the hairpin on that scaffold and then determine the gene with the minimum distance to the hairpin start or stop. I would have thought that the gene id that is found to be the min would become a key that could then be used to pull the data from that gene id to be printed in the output. But I'm a total newbie, so I will defer to you and wait to see more code. Thanks again for the help.
Btw, I won't be able to check back in until tomorrow evening EST.

d5e5 109 Master Poster · Answer 8 · 2011-08-06T02:33:32+00:00

I may need to see more of the code to see where you are going with this, but so far I'm confused as to why the hairpin id becomes a new hash key rather than a gene id as the idea is to compare the distances between a series of gene start and stops on a scaffold to the hairpin on that scaffold and then determine the gene with the minimum distance to the hairpin start or stop. I would have thought that the gene id that is found to be the min would become a key that could then be used to pull the data from that gene id to be printed in the output. But I'm a total newbie, so I will defer to you and wait to see more code. Thanks again for the help.
Btw, I won't be able to check back in until tomorrow evening EST.

You're right, the script I posted hasn't done anything with the gene data yet. It seemed strange to me that the hairpin hash had only scaffold as key, as scaffold is not unique. I may still be confused. I'll have another look on the weekend.

d5e5 109 Master Poster · Answer 9 · 2011-08-07T01:59:37+00:00

I may need to see more of the code to see where you are going with this, but so far I'm confused as to why the hairpin id becomes a new hash key rather than a gene id as the idea is to compare the distances between a series of gene start and stops on a scaffold to the hairpin on that scaffold and then determine the gene with the minimum distance to the hairpin start or stop. I would have thought that the gene id that is found to be the min would become a key that could then be used to pull the data from that gene id to be printed in the output. But I'm a total newbie, so I will defer to you and wait to see more code. Thanks again for the help.
Btw, I won't be able to check back in until tomorrow evening EST.

The following reads the gene data into an array, then builds the hash of hairpin data, which also includes a reference to the row of data from the genedata file that is nearest to each hairpin. So far it just dumps the contents of the hash linking hairpins to the nearest gene data record. Please let me know if I'm on the right track.

#!/usr/bin/perl;
#comparepremiRNAtocoding-pulloverlap3.pl;
use strict;
use warnings;
use List::Util qw[min max];
use Data::Dumper;

#You can delete the following line. My testing setup works better without command-line arguments
@ARGV = qw(dani-sampleHairpindata.txt dani-sampleGenedata.txt) if @ARGV == 0;

my $hairpin_filename = $ARGV[0];
my $gene_filename = $ARGV[1];

my @genedata = read_genedata();
chomp @genedata;

my %h_hash = ();
read_hairpindata();

print Dumper(\%h_hash);#Dump hairpin data structure

sub read_genedata{   
    open(my $fh, '<', $gene_filename)||die "open $gene_filename failed: $!";
    
    #Read entire genedata file into an array
    return <$fh>;
}

sub read_hairpindata{
    open(my $fh, '<', $hairpin_filename)||die "open $hairpin_filename failed: $!";
    
    #Read entire hairpindata file into a hash
    while(<$fh>){
        chomp;
        my @cols = split/\t/;#split by tab
        my $scaffold_loc = $cols[1];#make column 2 (index 1) the scaffold location key (i.e. the scaffolds);
        my $hairpin_id = $cols[0];#hairpin_id also needs to be a key so you can look up data later
        $h_hash{$scaffold_loc}{$hairpin_id}{start} = $cols[2];
        $h_hash{$scaffold_loc}{$hairpin_id}{stop} = $cols[3];
        $h_hash{$scaffold_loc}{$hairpin_id}{other} = [@cols[4..14]];
        $h_hash{$scaffold_loc}{$hairpin_id}{sequence_read} = $cols[15];
        $h_hash{$scaffold_loc}{$hairpin_id}{nearest_gene_data} = nearest_gene($scaffold_loc, $hairpin_id);
    }
}

sub nearest_gene{
    my ($s, $h) = @_; #scaffold location and hairpin_id received as arguments
    return "Gene not found for $s $h" if not exists $h_hash{$s}{$h};
    
    my $save_min = 999999;#Big number for comparing distances
    my @gene_rec;
    
    foreach (@genedata){
        chomp;
        my @c = split;
        my $min = min( abs($h_hash{$s}{$h}{start} - $c[2]),
                    abs($h_hash{$s}{$h}{stop} - $c[2]),
                    abs($h_hash{$s}{$h}{start} - $c[3]),
                    abs($h_hash{$s}{$h}{stop} - $c[3]) );
        if ($min < $save_min){
            $save_min = $min;
            @gene_rec = @c;
        }
    }
    return \@gene_rec;
}

d5e5 109 Master Poster · Answer 10 · 2011-08-07T21:10:10+00:00

I had to modify the function called nearest_gene to consider only genes whose scaffold location matched that of the hairpin. I also modified the variable names for the column data for readability. Please replace the corresponding subroutine in the above script with the following:

sub nearest_gene{
    my ($s, $h) = @_; #scaffold location and hairpin_id received as arguments
    
    my $save_min = 999999;#Big number for comparing distances
    my @gene_rec;
    
    foreach (@genedata){
        chomp;
        my @c = split;
        my ($g_id, $g_scaffold, $g_start, $g_stop, $g_other) = split /\s+/, $_, 5;
        #Added the following statement to test only genes with matching scaffold loc
        next unless $c[1] eq $s;
        
        my $min = min( abs($h_hash{$s}{$h}{start} - $g_start),
                    abs($h_hash{$s}{$h}{stop} - $g_start),
                    abs($h_hash{$s}{$h}{start} - $g_stop),
                    abs($h_hash{$s}{$h}{stop} - $g_stop) );
        if ($min < $save_min){
            $save_min = $min;
            @gene_rec = ($g_id, $g_scaffold, $g_start, $g_stop, $g_other);
        }
    }
    return \@gene_rec;
}

bio-grad 0 Junior Poster in Training · Answer 11 · 2011-08-07T21:59:47+00:00

I read through the code and there are some bits I'm not following that maybe you could explain? line 17-20 - I don't understand what is happening here. I see you open %h_hash, but I don't understand what is happening next. Is read_hairpindata being loaded into h_hash?

Also, I don't quite follow the work flow as I'm not used to subs calling on other subs. Can you give me a basic run through?

Anyhow, I think you are on the right track. I'll check back in a bit later today.

Thanks again!

d5e5 109 Master Poster · Answer 12 · 2011-08-08T02:06:12+00:00

I read through the code and there are some bits I'm not following that maybe you could explain? line 17-20 - I don't understand what is happening here. I see you open %h_hash, but I don't understand what is happening next. Is read_hairpindata being loaded into h_hash?
Also, I don't quite follow the work flow as I'm not used to subs calling on other subs. Can you give me a basic run through?
Anyhow, I think you are on the right track. I'll check back in a bit later today.
Thanks again!

One of the differences between the read_genedata function, or subroutine ('function' and 'subroutine' mean the same thing in Perl), and the read_hairpindata function is read_genedata returns a list of all the lines in the genedata file which the calling statement assigns to the @genedata array.

Whereas the read_hairpindata function does not return any data. Instead of returning something it does something. What it does is add data elements (in the form of hash and array references) to the %h_hash which we have to declare outside the read_hairpindata subroutine so we can later access it from outside the scope of that subroutine... so that's what's going on in lines 17 through 20:

Line 17 declares the %h_hash wherin we want to save the hairpin and corresponding nearest gene data.
line 18 calls the read_hairpindata subroutine in a void context meaning it doesn't assign the result to any variable, because the function doesn't return anything, but instead adds data to the %h_hash variable, thereby making it into a complex data structure which links each hairpin to the line of genedata that is the nearest to that hairpin.
Line 20 prints a dump of what's in %h_hash for our own information while we're writing and testing the program. Once we know exactly what data elements we want to retrieve from this hash, and figure out how to convert the references back into useable text, numbers, lists, or whatever, then we will remove line 20 because we won't need to see a data dump any more.

Remember: hashes can really only contain pairs of scalar elements, so to store arrays or other hashes in a hash we have to store references to them. We can later de-reference these references to retrieve what they refer to.

Note that line 42: $h_hash{$scaffold_loc}{$hairpin_id}{nearest_gene_data} = nearest_gene($scaffold_loc, $hairpin_id); within the read_hairpindata function calls the nearest_gene function and passes it the scaffold location and hairpin id of the hairpin for which the function will try to return the element in the @genedata array calculated to have the least distance from the hairpin.

One thing I don't understand is why the genedata file has many lines of data -- and therefore many pairs of stop and start locations -- for most genes. Since gene_id is not unique, I'm not sure which line I'm supposed to save for that gene, so I return the line whose start and stop locations have the least distance from the hairpin.

bio-grad 0 Junior Poster in Training · Answer 13 · 2011-08-08T03:01:57+00:00

Thank you for explaining the hash system and the referencing. I'll have to look into this a bit further so I can use this setup again down the road.

As for the gene file...The gene file has lots of data. This is because of the biology of the thing it is referencing. Essentially this data set tells me the boundaries of the gene (genes: Code1, JG12345 etc.)and which elements are present at which locations within genes (these are identified with the "transcript_id" prefix or ".t" suffix in the ID column).

I am determining which hairpins lie "outside" of a gene, "overlap" a gene, or are "inside" a gene. The element locations within the genes only become relevant when a hairpin is determined to be "inside" the gene. When it is within the boundaries of the gene the most critical pieces of information are the lines indicating 5'UTR, intron, CDS, or 3'UTR in the info column. If a hairpin location falls within one of these regions it is important and my plan is to output the gene info column with the hairpin information so that I know hairpinX resides within the 5'UTR of gene Code1.

If the hairpin lies distant to a gene then the elements within the gene become irrelevant and I was planning to delete them as redundant entries for any hit that was found to be "outside" a gene. In this case I just need to know that hairpinX resides 324 bases away from Code2.

This is why the original code is set to determine if the hairpin is within a gene, if it is not, then it determines the distance between it and genes nearby to find the closest one. If neither of the above scenarios work, then the hairpin is said to overlap a gene (ie. it lies both inside and outside of a gene) and only the gene ID is needed.

Does this help or am I being unclear?

d5e5 109 Master Poster · Answer 14 · 2011-08-09T00:38:10+00:00

Thanks for the explanation. I gather that some of the records in the genedata file may be used only later if certain conditions are met. I think this means that the function that searches for the nearest gene should calculate the distance from hairpin only for those records where the 'other' value is 'gene'. Is that correct? If so we could modify the script to test only the 'gene' rows. I also included an example of how to loop through and access the data stored in the %h_hash. The script still doesn't determine if the hairpin is inside, outside or overlapping the nearest gene, but knowing how to access the data elements in %h_hash is a step in that direction.

#!/usr/bin/perl;
#comparepremiRNAtocoding-pulloverlap3.pl;
use strict;
use warnings;
use List::Util qw[min max];
use Data::Dumper;

#You can delete the following line. My testing setup works better without command-line arguments
@ARGV = qw(dani-sampleHairpindata.txt dani-sampleGenedata.txt) if @ARGV == 0;

my $hairpin_filename = $ARGV[0];
my $gene_filename = $ARGV[1];

my @genedata = read_genedata();
chomp @genedata;

my %h_hash = ();
read_hairpindata();

#print Dumper(\%h_hash);#Dump hairpin data structure
#We can access the data for each hairpin id in each scaffold location
#by looping through %h_hash
foreach my $s(keys %h_hash){
    print "Scaffold location $s\n";
    
    foreach my $h(keys %{$h_hash{$s}}){
        my ($g_id, $g_scaffold, $g_start, $g_stop, $g_other) = $h_hash{$s}{$h}{nearest_gene_data}[0];
        my $nearest_gene = 'Not Found';
        $nearest_gene = $g_id if defined $g_id;
        print "Nearest Gene to Hairpin id $h is $nearest_gene\n";
    }
    print "\n";
}

sub read_genedata{   
    open(my $fh, '<', $gene_filename)||die "open $gene_filename failed: $!";
    
    #Read entire genedata file into an array
    return <$fh>;
}

sub read_hairpindata{
    open(my $fh, '<', $hairpin_filename)||die "open $hairpin_filename failed: $!";
    
    #Read entire hairpindata file into a hash
    while(<$fh>){
        chomp;
        my @cols = split/\t/;#split by tab
        my $scaffold_loc = $cols[1];#make column 2 (index 1) the scaffold location key (i.e. the scaffolds);
        my $hairpin_id = $cols[0];#hairpin_id also needs to be a key so you can look up data later
        $h_hash{$scaffold_loc}{$hairpin_id}{start} = $cols[2];
        $h_hash{$scaffold_loc}{$hairpin_id}{stop} = $cols[3];
        $h_hash{$scaffold_loc}{$hairpin_id}{other} = [@cols[4..14]];
        $h_hash{$scaffold_loc}{$hairpin_id}{sequence_read} = $cols[15];
        $h_hash{$scaffold_loc}{$hairpin_id}{nearest_gene_data} = nearest_gene($scaffold_loc, $hairpin_id);
    }
}

sub nearest_gene{
    my ($s, $h) = @_; #scaffold location and hairpin_id received as arguments
    
    my $save_min = 999999;#Big number for comparing distances
    my @gene_rec;
    
    foreach (@genedata){
        chomp;
        my @c = split;
        my ($g_id, $g_scaffold, $g_start, $g_stop, $g_other) = split /\s+/, $_, 5;
        #Added the following statement to test only genes with matching scaffold loc
        next unless ($c[1] eq $s) and ($c[4] eq 'gene');#Test only when other info contains 'gene'
        
        my $min = min( abs($h_hash{$s}{$h}{start} - $g_start),
                    abs($h_hash{$s}{$h}{stop} - $g_start),
                    abs($h_hash{$s}{$h}{start} - $g_stop),
                    abs($h_hash{$s}{$h}{stop} - $g_stop) );
        if ($min < $save_min){
            $save_min = $min;
            @gene_rec = ($g_id, $g_scaffold, $g_start, $g_stop, $g_other);
        }
    }
    return \@gene_rec;
}

Sorry, I didn't sleep much last night. A family crisis came up which may keep me away from the computer tomorrow and Wednesday at least, so if I don't answer right away that's why.

bio-grad 0 Junior Poster in Training · Answer 15 · 2011-08-09T01:11:21+00:00

Wow, thanks so much. As for the function that searches for the nearest gene, it should calculate the distance from hairpin only for those records where the 'other' value is 'gene' or empty (in the cases where the ID begins with JG, the 'other' value is empty as there is not as much data for these genes. I'll look this over and try to incorporate into my original script which tests for the "inside" "outside" "overlap" and get back to you after I see how it goes.

I hope all goes well as you deal with your "crisis". Those are never a picnic. Take care.

bio-grad 0 Junior Poster in Training · Answer 16 · 2011-08-10T21:53:42+00:00

Update - The code works :) Thanks.

Once I've figured out how to access the values in the hash I can use the values to calculate the outside/inside/overlap status. I'm still working through that by trying to figure out how to incorporate $save_min into the output. I tried adding it to the @gene_rec and then including it in line 27, so I could reference it in the print statement, but I keep getting an error that it is uninitialized. I also want to access other data that is part of @gene_rec, but when I tried to reference them from the print statement, I received the same error as with $save_min which I had added to line 27 and in line 65.

Am I understanding things correctly that @gene_rec data can be accessed through $h_hash{$s}{$h}{nearest_gene_data} with $h_hash{$s}{$h}{nearest_gene_data}[0] being the $g_id and $h_hash{$s}{$h}{nearest_gene_data}[4] being the $g_other?

Ideally the output would look like this:

Hairpin_ID, hairpin other, "outside/inside/overlap", gene ID, gene other, distance from hairpin, scaffold, hairpin start, hairpin stop, hairpin sequence

d5e5 109 Master Poster · Answer 17 · 2011-08-11T02:25:44+00:00

Since you have added a value to the array I think you need to increase the limit argument for the split from 5 to 6.

Try modifying line 69 from: my ($g_id, $g_scaffold, $g_start, $g_stop, $g_other) = split /\s+/, $_, 5; to: my ($g_id, $g_scaffold, $g_start, $g_stop, $g_other) = split /\s+/, $_, 6;

Am I understanding things correctly that @gene_rec data can be accessed through $h_hash{$s}{$h}{nearest_gene_data} with $h_hash{$s}{$h}{nearest_gene_data}[0] being the $g_id and $h_hash{$s}{$h}{nearest_gene_data}[4] being the $g_other?

Yes, that part is correct as far as I can tell.

d5e5 109 Master Poster · Answer 18 · 2011-08-11T03:04:07+00:00

Sorry, I found my mistake on line 27. See the comment and the modified statement (now line 28) of the following:

#!/usr/bin/perl;
#comparepremiRNAtocoding-pulloverlap3.pl;
use strict;
use warnings;
use List::Util qw[min max];
use Data::Dumper;

#You can delete the following line. My testing setup works better without command-line arguments
@ARGV = qw(dani-sampleHairpindata.txt dani-sampleGenedata.txt) if @ARGV == 0;

my $hairpin_filename = $ARGV[0];
my $gene_filename = $ARGV[1];

my @genedata = read_genedata();
chomp @genedata;

my %h_hash = ();
read_hairpindata();

#print Dumper(\%h_hash);#Dump hairpin data structure
#We can access the data for each hairpin id in each scaffold location
#by looping through %h_hash
foreach my $s(keys %h_hash){
    print "Scaffold location $s\n";
    
    foreach my $h(keys %{$h_hash{$s}}){
        #Modified following statement to properly dereference the array
        my ($g_id, $g_scaffold, $g_start, $g_stop, $g_other, $dist) = @{$h_hash{$s}{$h}{nearest_gene_data}};
        my $nearest_gene = 'Not Found';
        $nearest_gene = $g_id if defined $g_id;
        print "Nearest Gene to Hairpin id $h is $nearest_gene, distance: $dist\n";
    }
    print "\n";
}

sub read_genedata{   
    open(my $fh, '<', $gene_filename)||die "open $gene_filename failed: $!";
    
    #Read entire genedata file into an array
    return <$fh>;
}

sub read_hairpindata{
    open(my $fh, '<', $hairpin_filename)||die "open $hairpin_filename failed: $!";
    
    #Read entire hairpindata file into a hash
    while(<$fh>){
        chomp;
        my @cols = split/\t/;#split by tab
        my $scaffold_loc = $cols[1];#make column 2 (index 1) the scaffold location key (i.e. the scaffolds);
        my $hairpin_id = $cols[0];#hairpin_id also needs to be a key so you can look up data later
        $h_hash{$scaffold_loc}{$hairpin_id}{start} = $cols[2];
        $h_hash{$scaffold_loc}{$hairpin_id}{stop} = $cols[3];
        $h_hash{$scaffold_loc}{$hairpin_id}{other} = [@cols[4..14]];
        $h_hash{$scaffold_loc}{$hairpin_id}{sequence_read} = $cols[15];
        $h_hash{$scaffold_loc}{$hairpin_id}{nearest_gene_data} = nearest_gene($scaffold_loc, $hairpin_id);
    }
}

sub nearest_gene{
    my ($s, $h) = @_; #scaffold location and hairpin_id received as arguments
    
    my $save_min = 999999;#Big number for comparing distances
    my @gene_rec;
    
    foreach (@genedata){
        chomp;
        my @c = split;
        my ($g_id, $g_scaffold, $g_start, $g_stop, $g_other) = split /\s+/, $_, 6;
        #Added the following statement to test only genes with matching scaffold loc
        next unless ($c[1] eq $s) and ($c[4] eq 'gene');#Test only when other info contains 'gene'
        
        my $min = min( abs($h_hash{$s}{$h}{start} - $g_start),
                    abs($h_hash{$s}{$h}{stop} - $g_start),
                    abs($h_hash{$s}{$h}{start} - $g_stop),
                    abs($h_hash{$s}{$h}{stop} - $g_stop) );
        if ($min < $save_min){
            $save_min = $min;
            @gene_rec = ($g_id, $g_scaffold, $g_start, $g_stop, $g_other,$save_min);
        }
    }
    return \@gene_rec;
}

I still get one uninitialized value error at line 31 so the script needs more work.

bio-grad 0 Junior Poster in Training · Answer 19 · 2011-08-11T09:50:03+00:00

seems to me that perhaps the trouble lies with $dist when in fact there is no gene present on the scaffold. therefor maybe we could modify line 29 to read:

$nearest_gene = $g_id if defined $g_id && $dist>0;

and then

else print "Nearest Gene to Hairpin id $h is $nearest_gene\n";

would that work? I tried to put it in, but I seem to be messing up my parenthesis or something with the else statement.

d5e5 109 Master Poster · Answer 20 · 2011-08-13T01:10:16+00:00

seems to me that perhaps the trouble lies with $dist when in fact there is no gene present on the scaffold. therefor maybe we could modify line 29 to read:
$nearest_gene = $g_id if defined $g_id && $dist>0;
and then
else print "Nearest Gene to Hairpin id $h is $nearest_gene\n";
would that work? I tried to put it in, but I seem to be messing up my parenthesis or something with the else statement.

I think you have identified the problem of the undefined distance when no gene found correctly but your proposed solution won't work because $nearest_gene = $g_id if defined $g_id && $dist>0; contains a post-condition if followed by a semi-colon which ends the statement so the following else condition is orphaned (doesn't follow an if block). See http://perldoc.perl.org/perlintro.html and scroll down to

However, there is a clever way of making your one-line conditional blocks more English like:
1. # the traditional way
2. if ($zippy) {
3. print "Yow!";
4. }
5.
6. # the Perlish post-condition way
7. print "Yow!" if $zippy;
8. print "We have no bananas" unless $bananas;

We can try replacing the lines 21 through 34 with the following

#We can access the data for each hairpin id in each scaffold location
#by looping through %h_hash
foreach my $s(keys %h_hash){
    print "Scaffold location $s\n";
    
    foreach my $h(keys %{$h_hash{$s}}){
        #Modified following statement to properly dereference the array
        my ($g_id, $g_scaffold, $g_start, $g_stop, $g_other, $dist) = @{$h_hash{$s}{$h}{nearest_gene_data}};
        
        if (defined $g_id){
            print "Nearest Gene to Hairpin id $h is $g_id, distance: $dist\n";
        }
        else{
            print "No gene matches Hairpin id $h\n";
        }
    }
    print "\n";
}

This contains an if condition followed by a statement block and an else condition followed by another statement block, instead of the shorter post if condition.

bio-grad 0 Junior Poster in Training · Answer 21 · 2011-08-15T22:01:30+00:00

I've been running this on the real data since yesterday and it's still running. I'll let you know if it works once I get a chance to examine the output. Thanks again.

d5e5 109 Master Poster · Answer 22 · 2011-08-16T02:40:12+00:00

I've been running this on the real data since yesterday and it's still running. I'll let you know if it works once I get a chance to examine the output. Thanks again.

I didn't expect it to run so long. Unfortunately my wife and I have to travel for the rest of the week and I won't have a computer. Hopefully we'll be back by Friday. Let me know if the script still needs work and I'll have another look at it on the weekend.

bio-grad 0 Junior Poster in Training · Answer 23 · 2011-08-16T22:44:43+00:00

Thanks. Yep, I just restarted it and double checked the output. It isn't even printing the header and the code isn't giving any error messages in the terminal, so I'm not even sure where to begin :(

d5e5 109 Master Poster · Answer 24 · 2011-08-19T21:41:54+00:00

A good first step is to find data that will cause the problem so we can modify the program and test until the problem no longer occurs. Either there are some data in your files that the program doesn't handle properly, or else one or both of the files are so large that the program takes an unacceptably long time to load the data into memory. Can you post the data files that take forever to process? If they are too large to attach as txt files to your post, maybe you can upload them to a free file-sharing service such as http://fyels.com/ which gives you a link to the data allowing anyone to download it.

bio-grad 0 Junior Poster in Training · Answer 25 · 2011-08-19T21:53:34+00:00

I'll upload the files to fyels when I get my computer with the data up and running this afternoon.

d5e5 109 Master Poster · Answer 26 · 2011-08-20T02:12:01+00:00

I'll upload the files to fyels when I get my computer with the data up and running this afternoon.

OK. Remember to post the link to the data on fyels or I won't be able to find it.

Meanwhile I tweaked the program to make it more efficient but since it took less than a second to run the test data I can't tell if it's really any faster than before. Here's the modified program. Note that I use the grep command to filter the genedata before searching for the nearest gene. Plus I sort the genedata by scaffold so I can stop looping through the data after looping through the matching scaffold data. In theory that should save a little time.

#!/usr/bin/perl;
#comparepremiRNAtocoding-pulloverlap3.pl;
use strict;
use warnings;
use List::Util qw[min max];
use Benchmark;

my $t0 = Benchmark->new;

#You can delete the following line. My testing setup works better without command-line arguments
@ARGV = qw(dani-sampleHairpindata.txt dani-sampleGenedata.txt) if @ARGV == 0;

my $hairpin_filename = $ARGV[0];
my $gene_filename = $ARGV[1];
my @genedata = read_genedata();

@genedata = grep {m/^.+$/} @genedata;#Filter array lines to keep only non-blank lines

#Filter array and keep only lines whose other info
#contains 'gene' or gene id begins with JG and other info is empty
@genedata = grep {my ($g_id, $g_scaffold, $g_start, $g_stop, $g_other) = split /\s+/, $_, 6;
                    defined $g_other and $g_other eq 'gene'
                  or
                    defined $g_id and $g_id =~ m/^JG/
                    and !defined $g_other                
                    } @genedata;

chomp @genedata;

@genedata = sort by_scaffold @genedata;

my %h_hash = ();
read_hairpindata();

my $t1 = Benchmark->new;
my $td = timediff($t1, $t0);
print "Reading the data into memory took:",timestr($td),"\n";

#print Dumper(\%h_hash);#Dump hairpin data structure
#We can access the data for each hairpin id in each scaffold location
#by looping through %h_hash
my $header = 'Hairpin_ID, hairpin other, outside/inside/overlap,'
               . 'gene ID, gene other, distance from hairpin, scaffold,'
               . 'hairpin start, hairpin stop, hairpin sequence';#Heading line
               
print $header, "\n";
        
foreach my $s(keys %h_hash){
    foreach my $h(keys %{$h_hash{$s}}){
        #dereference the genedata array
        my ($g_id, $g_scaffold, $g_start, $g_stop, $g_other, $dist) = @{$h_hash{$s}{$h}{nearest_gene_data}};
        
        #Dereference array reference and join array elements
        my $hairpin_other = join '', @{$h_hash{$s}{$h}{other}};
        my $hairpin_start = $h_hash{$s}{$h}{start};
        my $hairpin_stop = $h_hash{$s}{$h}{stop};
        my $hairpin_seq = $h_hash{$s}{$h}{sequence_read};
        
        if (defined $g_id){
            my $oio = calc_rel_pos($hairpin_start, $hairpin_stop, $g_start, $g_stop);
            print "$h,$hairpin_other,$oio,$g_id,$g_other,$dist,$s,$hairpin_start,$hairpin_stop,$hairpin_seq\n";
        }
        else{
            print "$h,$hairpin_other,,,,,$s,$hairpin_start,$hairpin_stop,$hairpin_seq\n";
        }
    }
}

sub read_genedata{   
    open(my $fh, '<', $gene_filename)||die "open $gene_filename failed: $!";
    
    #Read entire genedata file into an array
    return <$fh>;
}

sub read_hairpindata{
    open(my $fh, '<', $hairpin_filename)||die "open $hairpin_filename failed: $!";
    
    #Read entire hairpindata file into a hash
    while(<$fh>){
        chomp;
        my @cols = split/\t/;#split by tab
        my $scaffold_loc = $cols[1];#make column 2 (index 1) the scaffold location key (i.e. the scaffolds);
        my $hairpin_id = $cols[0];#hairpin_id also needs to be a key so you can look up data later
        $h_hash{$scaffold_loc}{$hairpin_id}{start} = $cols[2];
        $h_hash{$scaffold_loc}{$hairpin_id}{stop} = $cols[3];
        $h_hash{$scaffold_loc}{$hairpin_id}{other} = [@cols[4..14]];
        $h_hash{$scaffold_loc}{$hairpin_id}{sequence_read} = $cols[15];
        $h_hash{$scaffold_loc}{$hairpin_id}{nearest_gene_data} = nearest_gene($scaffold_loc, $hairpin_id);
    }
}

sub nearest_gene{
    my ($s, $h) = @_; #scaffold location and hairpin_id received as arguments
    
    my $save_min = 999999;#Big number for comparing distances
    my @gene_rec;
    my ($scaffold_match_start, $scaffold_match_end);
    foreach (@genedata){
        chomp;
        my @c = split;
        my ($g_id, $g_scaffold, $g_start, $g_stop, $g_other) = split /\s+/, $_, 6;
        
        #Test only when gene scaffold matches hairpin scafforld
        $scaffold_match_start = 1 if $g_scaffold eq $s;
        $scaffold_match_end = 1 if $scaffold_match_start and $g_scaffold ne $s;
        next unless $scaffold_match_start;
        last if $scaffold_match_end;#Break out of the loop when finished matching scaffold group
        
        my $min = min( abs($h_hash{$s}{$h}{start} - $g_start),
                    abs($h_hash{$s}{$h}{stop} - $g_start),
                    abs($h_hash{$s}{$h}{start} - $g_stop),
                    abs($h_hash{$s}{$h}{stop} - $g_stop) );

        if ($min < $save_min){
            $save_min = $min;
            @gene_rec = ($g_id, $g_scaffold, $g_start, $g_stop, $g_other,$save_min);
        }
    }
    return \@gene_rec;
}

sub calc_rel_pos{
    my ($h_start, $h_stop, $g_start, $g_stop) = @_;
    
    if (is_between($h_start, $g_start, $g_stop)
        && is_between($h_stop, $g_start, $g_stop)){
        return 'inside';
    }
    
    if (!is_between($h_start, $g_start, $g_stop)
    && is_between($h_stop, $g_start, $g_stop)
    ||
    is_between($h_start, $g_start, $g_stop)
    && !is_between($h_stop, $g_start, $g_stop)){
    return 'overlap';
    }
    
    if (!is_between($h_start, $g_start, $g_stop)
    && !is_between($h_stop, $g_start, $g_stop)){
        return 'outside';
    }
}

sub is_between{
    my ($x, $lower_limit, $upper_limit) = @_;
    return ( $x >= $lower_limit && $x <= $upper_limit );
}

sub by_scaffold{
    my ($id_a, $scaffold_a) = split /\s+/, $a;
    my ($id_b, $scaffold_b) = split /\s+/, $b;
    $scaffold_a cmp $scaffold_b;
}

bio-grad 0 Junior Poster in Training · Answer 27 · 2011-08-20T03:14:03+00:00

I'll run the new script and see if it runs any better. In the meantime, here are the links:

http://tiffanie_yael.fyels.com/bab - coding sequences/genes
http://tiffanie_yael.fyels.com/cab - premiRNA/hairpins

thanks again!

d5e5 109 Master Poster · Answer 28 · 2011-08-21T02:08:58+00:00

I downloaded the two files and tested the program. It seemed to take forever so I made a few more changes that hopefully will speed it up a little more. Also I have it print debugging info to the screen while it's running so we have some idea what it's doing.

I see there are over 168,000 lines of data in the hairpin file. If it took one second to find the nearest gene for each of those lines that would total about 46 hours! Fortunately it seems to run faster than that on my laptop, adding about 5 to 10 hairpins to the hash every second so hopefully it should finish running after about 5 to 10 hours. That's just a guess because I can't leave my laptop running that long (it's on dining room table, no desk).

If it's still not fast enough, would you consider loading the data into a relational database, such as SQLite? That would require writing a Perl program to load the data into tables in the database, and then require significant changes to the program(s) that read the database and contain the logic to determine nearest gene, etc. SQLite is easy to install and use with Perl. see http://search.cpan.org/~msergeant/DBD-SQLite-0.31/lib/DBD/SQLite.pm

Meanwhile here's the latest, and hopefully the fastest, version of our Perl script that may take about 5 to 10 hours to produce an output file from those two text files you uploaded to fyels.com

#!/usr/bin/perl;
#comparepremiRNAtocoding-pulloverlap3.pl;
use strict;
use warnings;
use List::Util qw[min max];
use Benchmark;

my $t0 = Benchmark->new;

#You can delete the following line. My testing setup works better without command-line arguments
@ARGV = qw(cab-ALL.LU-premiRNAs.withLOCATION.Vmatch-Nvmr-EST.list bab-All.LuCoding.061511) if @ARGV == 0;

my $hairpin_filename = $ARGV[0];
my $gene_filename = $ARGV[1];
my @genedata;
$| = 1;#Flush print buffer

read_genedata();#Call subroutine to read selected records from file into array

my $t1 = Benchmark->new;
my $td = timediff($t1, $t0);
print "Reading gene file into array took:",timestr($td),"\n";

@genedata = sort by_scaffold @genedata;
my $t2 = Benchmark->new;
$td = timediff($t2, $t1);
print "Sorting the gene array took:",timestr($td),"\n";

my %h_hash = ();
read_hairpindata();
my $t3 = Benchmark->new;
$td = timediff($t2, $t0);
print "Building the hairpin-to-nearest-gene hash took:",timestr($td),"\n";

#We can access the data for each hairpin id in each scaffold location
#by looping through %h_hash
my $header = 'Hairpin_ID, hairpin other, outside/inside/overlap,'
               . 'gene ID, gene other, distance from hairpin, scaffold,'
               . 'hairpin start, hairpin stop, hairpin sequence';#Heading line
               
#Because we're printing debug info (time spent, record counts) to screen
#Let's open an output file for the hairpin-nearest-gene results
my $outfilename = 'hairpin-nearest-gene.txt';
open my $fh_output, '>', $outfilename or die "Failed to open $outfilename: $!";
print $outfilename $header, "\n";
        
foreach my $s(keys %h_hash){
    foreach my $h(keys %{$h_hash{$s}}){
        #dereference the genedata array
        my ($g_id, $g_scaffold, $g_start, $g_stop, $g_other, $dist) = @{$h_hash{$s}{$h}{nearest_gene_data}};
        
        #Dereference array reference and join array elements
        my $hairpin_other = join '', @{$h_hash{$s}{$h}{other}};
        my $hairpin_start = $h_hash{$s}{$h}{start};
        my $hairpin_stop = $h_hash{$s}{$h}{stop};
        my $hairpin_seq = $h_hash{$s}{$h}{sequence_read};
        
        if (defined $g_id){
            my $oio = calc_rel_pos($hairpin_start, $hairpin_stop, $g_start, $g_stop);
            print $outfilename "$h,$hairpin_other,$oio,$g_id,$g_other,$dist,$s,$hairpin_start,$hairpin_stop,$hairpin_seq\n";
        }
        else{
            print $outfilename "$h,$hairpin_other,,,,,$s,$hairpin_start,$hairpin_stop,$hairpin_seq\n";
        }
    }
}

sub read_genedata{
    #loop through genedata file and push required records into array
    my $count = 0;
    open(my $fh, '<', $gene_filename)||die "open $gene_filename failed: $!";
    while (<$fh>){
        next if m/^\s*$/;#Skip blank lines
        chomp;
        my ($g_id, $g_scaffold, $g_start, $g_stop, $g_other) = split /\s+/, $_, 6;
        next unless defined $g_other and $g_other eq 'gene'
                  or
                    defined $g_id and $g_id =~ m/^JG/
                    and !defined $g_other;#Read only 'gene' records
        $count++;
        print "$count records saved to genedata array\n";
        push @genedata, $_;
    }
}

sub read_hairpindata{
    open(my $fh, '<', $hairpin_filename)|| die "open $hairpin_filename failed: $!";
my $count;
    #Read entire hairpindata file into a hash
    while(<$fh>){
        chomp;
        my @cols = split/\t/;#split by tab
        my $scaffold_loc = $cols[1];#make column 2 (index 1) the scaffold location key (i.e. the scaffolds);
        my $hairpin_id = $cols[0];#hairpin_id also needs to be a key so you can look up data later
        $h_hash{$scaffold_loc}{$hairpin_id}{start} = $cols[2];
        $h_hash{$scaffold_loc}{$hairpin_id}{stop} = $cols[3];
        $h_hash{$scaffold_loc}{$hairpin_id}{other} = [@cols[4..14]];
        $h_hash{$scaffold_loc}{$hairpin_id}{sequence_read} = $cols[15];
        $h_hash{$scaffold_loc}{$hairpin_id}{nearest_gene_data} = nearest_gene($scaffold_loc, $hairpin_id);
        $count++;
        print "$count hairpin added to hash\n";
    }
}

sub nearest_gene{
    my ($s, $h) = @_; #scaffold location and hairpin_id received as arguments
    
    my $save_min = 999999;#Big number for comparing distances
    my @gene_rec;
    my @genes = grep {
        my ($g_id, $g_scaffold, $g_start, $g_stop, $g_other) = split /\s+/, $_, 6;
        $g_scaffold eq $s;
    } @genedata;
    
    foreach (@genes){
        chomp;
        my ($g_id, $g_scaffold, $g_start, $g_stop, $g_other) = split /\s+/, $_, 6;
        my $min = min( abs($h_hash{$s}{$h}{start} - $g_start),
                    abs($h_hash{$s}{$h}{stop} - $g_start),
                    abs($h_hash{$s}{$h}{start} - $g_stop),
                    abs($h_hash{$s}{$h}{stop} - $g_stop) );

        if ($min < $save_min){
            $save_min = $min;
            @gene_rec = ($g_id, $g_scaffold, $g_start, $g_stop, $g_other,$save_min);
        }
    }
    return \@gene_rec;
}

sub calc_rel_pos{
    my ($h_start, $h_stop, $g_start, $g_stop) = @_;
    
    if (is_between($h_start, $g_start, $g_stop)
        && is_between($h_stop, $g_start, $g_stop)){
        return 'inside';
    }
    
    if (!is_between($h_start, $g_start, $g_stop)
    && is_between($h_stop, $g_start, $g_stop)
    ||
    is_between($h_start, $g_start, $g_stop)
    && !is_between($h_stop, $g_start, $g_stop)){
    return 'overlap';
    }
    
    if (!is_between($h_start, $g_start, $g_stop)
    && !is_between($h_stop, $g_start, $g_stop)){
        return 'outside';
    }
}

sub is_between{
    my ($x, $lower_limit, $upper_limit) = @_;
    return ( $x >= $lower_limit && $x <= $upper_limit );
}

sub by_scaffold{
    my ($id_a, $scaffold_a) = split /\s+/, $a;
    my ($id_b, $scaffold_b) = split /\s+/, $b;
    $scaffold_a cmp $scaffold_b;
}