I must find the restriction maps in a FASTA file and then for every sequence print out a graphical display of the cut sites in the restriction map by printing the sequence and labeling the recognition sites with the enzyme name. I also must use a hash to store the data. I must print out all graphical displays of every sequence.
This is what I have:
#!/usr/bin/perl
# Declare and initialize variables
my %rebase_hash = ();
unless(dbmopen(%rebase_hash, 'DBNAME', 0644)) {
print "Cannot open DBM file DBNAME with mode 0644\n";
}
my @file_data = ();
my $query = '';
my $dna = '';
my $recognition_site = '';
my $regexp = '';
my @locations = ();
# If there is a command-line argument, assume it's DNA
if(@ARGV) {
$dna = $ARGV[0];
# Otherwise, prompt for a FASTA file to open
}else{
print "Input a FASTA filename: ";
my $filename = <STDIN>;
chomp $filename;
unless(-e $filename) {
print "$filename does not exist!\n";
exit;
}
# Read in the file
@file_data = get_file_data($filename);
# Extract the DNA sequence data from the contents of the FASTA file
$dna = extract_sequence_from_fasta_data(@file_d…
}
# Get the REBASE data into a hash, from file "DNASIS Rebase"
%rebase_hash = parseREBASE('DNASIS.txt');
# Prompt user for restriction enzyme names, create restriction map
do {
print "Search for what restriction site (or quit)?: ";
$query = <STDIN>;
chomp $query;
# Exit if empty query
if ($query =~ /^\s*$/ ) {
exit;
}
# Perform the search in the DNA sequence
if ( exists $rebase_hash{$query} ) {
($recognition_site, $regexp) = split ( " ", $rebase_hash{$query});
# Create the restriction map
@locations = match_positions($regexp, $dna);
# Report the restriction map to the user
if (@locations) {
print "Searching for $query $recognition_site $regexp\n";
print "A restriction site for $query at locations:\n";
print join(" ", @locations), "\n";
} else {
print "A restriction enzyme $query is not in the DNA:\n";
}
}
print "\n";
} until ( $query =~ /quit/ );
exit;
######################################…
# Subroutines
######################################…
######################################…
# A subroutine to get data from a file given its filename
# get_file_data
sub get_file_data {
my($filename) = @_;
use strict;
use warnings;
# Initialize variables
my @filedata = ( );
unless( open(GET_FILE_DATA, $filename) ) {
print STDERR "Cannot open file \"$filename\"\n\n";
exit;
}
@filedata = <GET_FILE_DATA>;
close GET_FILE_DATA;
return @filedata;
}
# a subroutine to extract FASTA sequence data from an array
sub extract_sequence_from_fasta_data {
my(@fasta_file_data) = @_;
use strict;
use warnings;
# Declare and initialize variables
my $sequence = '';
foreach my $line (@fasta_file_data) {
# discard blank line
if ($line =~ /^\s*$/) {
next;
# discard comment line
} elsif($line =~ /^\s*#/) {
next;
# discard fasta header line
} elsif($line =~ /^>/) {
next;
# keep line, add to sequence string
} else {
$sequence .= $line;
}
}
# remove non-sequence data (in this case, whitespace) from $sequence string
$sequence =~ s/\s//g;
return $sequence;
}
# A subroutine to return a hash where
# key = restriction enzyme name
# value = whitespace-separated recognition site and regular expression
sub parseREBASE {
my($rebasefile) = @_;
use strict;
use warnings;
# Declare variables
my @rebasefile = ( );
my %rebase_hash = ( );
my $name;
my $site;
my $regexp;
# Read in the REBASE file
my $rebase_filehandle = open_file($rebasefile);
while(<$rebase_filehandle>) {
# Discard header lines
( 1 .. /Rich Roberts/ ) and next;
# Discard blank lines
/^\s*$/ and next;
# Split the two (or three if includes parenthesized name) fields
my @fields = split( " ", $_);
# Get and store the name and the recognition site
# Remove parenthesized names, for simplicity's sake,
# by not saving the middle field, if any,
# just the first and last
$name = shift @fields;
$site = pop @fields;
# Translate the recognition sites to regular expressions
$regexp = IUB_to_regexp($site);
# Store the data into the hash
$rebase_hash{$name} = "$site $regexp";
}