Hi everyone,
I am using this code to parse data from LaTeX files:
#!/Applications/xampp/xamppfiles/bin/perl
use strict;
use warnings;
use File::Copy;
# The package instantiated further down is LaTeX::Parser (capital X). The
# mis-cased spelling "LaTex::Parser" only loads on case-insensitive
# filesystems and fails with "Can't locate LaTex/Parser.pm" elsewhere.
use LaTeX::Parser;
use Text::CSV_XS;
use utf8;
use Unicode::String;
Unicode::String->stringify_as( 'utf8' ); # utf8 already is the default
#see Ch 7.1 of Data Munging with Perl
# Results file: one tab-separated line (file ID, categories, address) per
# address found. The bareword handle is kept because the rest of the script
# prints to LAT_LON_OUT_FILE by name.
my $output = "ID+add.txt";
# Three-argument open keeps the mode separate from the file name, so the name
# can never be misread as part of the mode.
open (LAT_LON_OUT_FILE, '>', $output) or die "Couldn't open $output for writing: $!\n";
binmode LAT_LON_OUT_FILE, ":utf8";
#This routine is supposed to work in the dir where the .tar.gz files are
#It extracts each .tar.gz file into a temp directory
my $tdir = 'temp';
# Re-using an existing temp directory is fine; anything else is fatal.
unless (-d $tdir)
{
    mkdir $tdir or die "Couldn't create directory $tdir: $!\n";
}
#Collect the archives before changing directory -the glob is relative to the start dir
my @flist=<*.tar.gz>;
# The main loop erases every "*.*" file in the current directory before each
# archive. If this chdir failed silently, that would wipe the start directory
# (including the archives themselves), so stop here instead.
chdir $tdir or die "Couldn't change into directory $tdir: $!\n";
my $not_processed = 0; #Number of files that the script cannot process
my $number_of_papers = 0;
my $file_cat = " ";
#Go through all the .tar.gz files in current dir
# Main loop. For every .tar.gz archive collected in @flist:
#   1. unpack it into the current (temp) directory,
#   2. read the paper's categories from the matching ../<id>.abs file,
#   3. parse every TeX source of the archive with LaTeX::Parser and collect
#      the author/address fields of the paper header,
#   4. clean up each address and print "<id>\t<categories>\t<address>" to
#      LAT_LON_OUT_FILE.
# Uses names set up earlier in the script: @flist, $file_cat, $not_processed
# and the open LAT_LON_OUT_FILE handle. Archives are addressed as "../<name>",
# i.e. relative to the temp directory the script has changed into.
foreach my $item (@flist)
{
    unlink glob "*.*"; #erase all files left over from the previous archive

    #copy the .gz file to the temp dir; without it nothing below can work
    unless (copy("../$item", $item))
    {
        warn "Couldn't copy ../$item into the temp dir: $!\n";
        $not_processed++;
        next;
    }
    print "$item\n";

    # List-form system() bypasses the shell, so spaces or shell metacharacters
    # in an archive name cannot be misinterpreted.
    system 'gzip', '-dnv', $item;       #ungzip ...
    my $ftar = substr($item, 0, -3);    #remove extension .gz ...
    print "$ftar\n";
    my $tar_error = system 'tar', '-xm', '-f', $ftar; #and untar

    #untar may not work if archive has only one file. In that case
    #gzip gave already a tex file -just need to rename it.
    if ($tar_error)
    {
        rename $ftar, substr($ftar, 0, -4) . ".tex";
    }
    else
    {
        unlink $ftar; #tar file needs to be deleted, otherwise it will be in @texlist
    }

    # Read the corresponding .abs file. "FILE: \n" is prepended so that a
    # field on the very first line is still preceded by the "\n" the pattern
    # below requires.
    my $file_ID    = substr($ftar, 0, -4);
    my $input_file = "../" . $file_ID . ".abs";
    open (my $abs_fh, '<', $input_file) or die "Couldn't open $input_file for reading: $!\n";
    my $file_content = join '', "FILE: \n", <$abs_fh>;
    close($abs_fh);

    # Only the categories are used further down. The capture is read only
    # when the match succeeded; otherwise $1 would still hold the capture of
    # some earlier, unrelated match.
    $file_cat = ($file_content =~ /\nCategories:\s([^\n]+)/) ? $1 : " ";

    #process TeX files. Note that some of the files had extension .tex whereas others had .latex or .txt
    #note that there is a problem in looking just for <*.*t*> -you also get .sty files, which are style files!
    my @texlist = (glob('*.latex'), glob('*.tex'), glob('*.txt'));

    my @author;            #Array with authors/addresses ("author: ..." / "address: ..." entries)
    my $author_index = 0;  #Index into @author
    my $myflag = 0;        #1 while inside an \altaffiltext group (see below)

    #Go through .TeX files in dir $tdir and assign @author
    foreach my $texfile (@texlist)
    {
        printf("aaaa %s\n", $texfile);

        # LaTeX::Parser can abort with errors such as
        # "A \ Command I don't understand" on markup it does not recognise.
        # Trap that so one bad source file does not stop the whole run:
        # report it, count it, and move on to the next file.
        my $p = eval {
            my $parser = LaTeX::Parser->new('file' => $texfile);
            $parser->latex;
        };
        if (!defined $p)
        {
            my $why = $@ || "parser returned no data\n";
            warn "Skipping $texfile (from $item): $why";
            $not_processed++;
            next;
        }

        my $next_field = 0;      #if $next_field=1 then next field is author; $next_field=2 then next field is address
        my $ambiguous_field = 1; #sometimes the author and address are ambiguous. Detect these.

        #Go through TeX fields in files and assign array @author
        for my $authorfield (@{$p})
        {
            #Some author/address fields are empty others don't have a LateX field '{' -skip them
            next if (($next_field == 1 || $next_field == 2) && (!($authorfield =~ /\w/) || !($authorfield =~ /\{/)));

            #Assign author field
            if ($next_field == 1 || ($authorfield =~ /\\large/i) || ($authorfield =~ /\\small/i))
            {
                $author[$author_index] = "author: " . $authorfield;
                $next_field = 0;
                $author_index++;
                $ambiguous_field = 0; #We have detected an author field, so file is unambiguous
            }

            #Assign address field
            if ($next_field == 2)
            {
                $author[$author_index] = "address: " . $authorfield;
                $next_field = 0;
                $author_index++;
                $ambiguous_field = 0; #We have detected an address field, so file is unambiguous
            }

            #next field to be read will be the author
            $next_field = 1 if (($authorfield =~ /\\aut(?:\w)+/) || ($authorfield =~ /\\large/i) || ($authorfield =~ /\\center/i) || ($authorfield =~ /\\small/i));

            #next field to be read will be the address
            $next_field = 2 if (($authorfield =~ /\\add(?:\w)+/) || ($authorfield =~ /\\affi(?:\w)+/) || ($authorfield =~ /\\inst(?:\w)*/) || (($authorfield =~ /\{\d\}/) and ($myflag == 1)));

            #two fields after will be an address
            if ($authorfield =~ /\\altaffiltext\s?/) { $myflag = 1; }
            if (($authorfield =~ /\{([^=]+,+[^=]+)\}/) and ($myflag == 1)) { $myflag = 0; }

            #we are only interested in the header of the paper, so if you find abstract
            #and you have found author/address then end loop
            last if ($authorfield =~ /^\{abst(?:\w)+/ && !$ambiguous_field);
        }

        #Skip files that do not have author/address
        next if ($ambiguous_field);

        # Clean up and print the addresses. $adr aliases the @author element,
        # so a matched entry is overwritten with its cleaned-up text and will
        # not match the "address: {...}" pattern again on a later pass.
        foreach my $adr (@author)
        {
            #Look at the field if it is an address or if there is no address field in the @author array
            #(in which case the address will be in the author field)
            next unless (($adr =~ /address/) || !(grep {/address/} @author));

            if ($adr =~ /address: \{([^=]+,+[^=]+)\}/)
            {
                $adr = $1;
                $adr =~ s/\n/ /g;
                $adr =~ s/{//g;
                $adr =~ s/}//g;
                $adr =~ s/\.$//g;
                $adr =~ s/\s+/ /g;
                $adr =~ s/\\+\s*$//g;

                #Let's check multiple adresses in a row..
                # These commands transform numbered separators into \instA. Further versions of the code could extract author-address relationship
                $adr =~ s/\$\^?\S+\s?\S*\$/\\instA/g;
                $adr =~ s/\\+\s?\d\./\\instA/g;
                $adr =~ s/\\+\s?\d\-/\\instA/g;
                $adr =~ s/\\+\s?\S\)/\\instA/g;
                $adr =~ s/\\+\s?\d\)/\\instA/g;

                #These ones transform different usages of \and into \instA
                $adr =~ s/\\+\s?and\s?/\\instA/g;
                $adr =~ s/,?\sand\s?\\+/\\instA/g;
                $adr =~ s/\\newline/\\instA/g;
                $adr =~ s/;\s\\+/\\instA/g;
                $adr =~ s/,\\+/\\instA/g;
                $adr =~ s/\\medskip/\\instA/g;

                if ( $adr =~ /(\\inst)/ )
                {
                    # Separates multiple addresses linked by a \inst marker,
                    # peeling them off from the end of the string. The loop
                    # runs while a marker is found at ANY offset, including 0:
                    # stopping at "> 0" silently dropped the address that
                    # follows a marker at the very start of the string.
                    my $pos = rindex($adr, "\\inst");
                    while ($pos >= 0)
                    {
                        my $resul = substr($adr, $pos+6); #text after the 6-character "\instA" marker
                        # NOTE(review): "\\\*s*" below matches a backslash, a literal '*'
                        # and optional 's' characters; possibly "\\*\s*" was intended -confirm.
                        $resul =~ s/,?\s*\\\*s*\\\*s*\\*$//g;
                        $resul =~ s/^\\,\s//g;
                        $resul =~ s/(email|E-mail|e-mail):?\s*[^\s]+//g;
                        $resul =~ s/[^\s]+\@[^\=]+//g;
                        # NOTE(review): "\\S+" is a backslash followed by capital S
                        # characters, not the \S character class -confirm intent.
                        $resul =~ s/\\S+$//g;
                        $resul =~ s/\\\s\\[^=]+$//g;
                        $resul =~ s/\\+\s?\[\\affilskip\]\s*$//g;
                        $resul =~ s/\,\s?$//g;
                        $resul =~ s/\.$//g;
                        $resul =~ s/\;$//g;
                        $resul =~ s/\;\s+\S+\.\S+\.?\S*$//g;
                        $resul =~ s/\\+\s*$//g;
                        #very short leftovers are not treated as addresses
                        if (length($resul) > 10) { print LAT_LON_OUT_FILE "$file_ID\t$file_cat\t$resul\n"; }

                        $adr = substr($adr, 0, $pos);
                        $pos = rindex($adr, "\\inst");
                        if (($pos == -1) and (length($adr) > 10))
                        {
                            #there is no \inst at the beginning. Print the remaining of the string and leave.
                            $adr =~ s/(email|E-mail|e-mail):?\s*[^\s]+//g;
                            $adr =~ s/[^\s]+\@[^\=]+//g;
                            $adr =~ s/\\+\s?\[\\affilskip\]\s*$//g;
                            $adr =~ s/\\S+$//g;
                            $adr =~ s/\\\s\\[^=]+$//g;
                            $adr =~ s/\,\s?$//g;
                            $adr =~ s/\.$//g;
                            $adr =~ s/\;$//g;
                            $adr =~ s/\;\s+\S+\.\S+\.?\S*$//g;
                            $adr =~ s/\\+\s*$//g;
                            print LAT_LON_OUT_FILE "$file_ID\t$file_cat\t$adr\n";
                        }
                    }
                }
                else
                {
                    #single address: strip e-mail and trailing LaTeX/punctuation residue, then print
                    $adr =~ s/(email|E-mail|e-mail):?\s*[^\s]+//g;
                    $adr =~ s/[^\s]+\@[^\=]+//g;
                    $adr =~ s/\\S+$//g;
                    $adr =~ s/\\\s\\[^\=]+$//g;
                    $adr =~ s/\\+\s?\[\\affilskip\]\s*$//g;
                    $adr =~ s/\,\s?$//g;
                    $adr =~ s/\.$//g;
                    $adr =~ s/\;\s+\S+\.\S+\.?\S*$//g;
                    $adr =~ s/\\+\s*$//g;
                    print LAT_LON_OUT_FILE "$file_ID\t$file_cat\t$adr\n";
                }
            }
        }
    }
}
# Buffered write errors (disk full, quota, ...) only surface when the handle
# is closed, so a failing close means the results file may be incomplete.
close LAT_LON_OUT_FILE or die "Couldn't close the output file: $!\n";
But I am getting an error like this:
A \ Command I don't understand at /Applications/XAMPP/xamppfiles/lib/perl5/site_perl/5.10.0/LaTex/Parser.pm line 115.
Can somebody suggest what the problem could be and how to solve it?
I am using the XAMPP server, Perl 5.10.1 and the LaTeX::Parser module to do this. Thanks