I am trying to extract the number from the string "(c) 2010 Elsevier Ltd" in a PDF document. I found that the TextSearch class of the PDFTron package can help me find the string, as shown in this example code:
import pdftron.Common.PDFNetException;
import pdftron.PDF.*;
import pdftron.SDF.SDFDoc;
/**
 * This sample illustrates the basic text search capabilities of PDFNet.
 *
 * <p>It opens "credit card numbers.pdf", then drives a single {@code TextSearch}
 * through four consecutive steps, changing the pattern and mode after each hit:
 * <ol>
 *   <li>step 0: find the literal name "joHn sMiTh";</li>
 *   <li>step 1: find a 4-4-4-4 digit card number and print its highlight pages;</li>
 *   <li>step 2: find a 4-6-5 digit (AMEX style) card number;</li>
 *   <li>step 3: search upwards for a two-word name, add a link annotation over
 *       every highlighted quad, save the document and stop.</li>
 * </ol>
 *
 * <p>The document is closed in a {@code finally} block so it is released even when
 * a {@code PDFNetException} is thrown while searching, annotating or saving, and
 * {@code PDFNet.terminate()} always runs before {@code main} returns.
 */
public class TextSearchTest
{
    /**
     * Runs the text search sample. Any {@code PDFNetException} is printed to
     * standard output rather than propagated.
     *
     * @param args command line arguments (not used)
     */
    public static void main(String[] args)
    {
        PDFNet.initialize();
        String input_path = "../../TestFiles/";
        try
        {
            PDFDoc doc = new PDFDoc(input_path + "credit card numbers.pdf");
            try
            {
                doc.initSecurityHandler();
                TextSearch txt_search = new TextSearch();
                int mode = TextSearch.e_whole_word | TextSearch.e_page_stop;
                String pattern = "joHn sMiTh";

                //call Begin() method to initialize the text search.
                txt_search.begin( doc, pattern, mode, -1, -1 );

                //'step' selects which of the four search stages handles the next hit.
                int step = 0;

                //call Run() method iteratively to find all matching instances.
                while ( true )
                {
                    TextSearchResult result = txt_search.run();
                    if ( result.getCode() == TextSearchResult.e_found )
                    {
                        if ( step == 0 )
                        {
                            //step 0: found "John Smith"
                            //note that, here, 'ambient_string' and 'hlts' are not written to,
                            //as 'e_ambient_string' and 'e_highlight' are not set.
                            System.out.println(result.getResultStr() + "'s credit card number is:");

                            //now switch to using regular expressions to find John's credit card number
                            mode = txt_search.getMode();
                            mode |= TextSearch.e_reg_expression | TextSearch.e_highlight;
                            txt_search.setMode(mode);
                            String new_pattern = "\\d{4}-\\d{4}-\\d{4}-\\d{4}"; //or "(\\d{4}-){3}\\d{4}"
                            txt_search.setPattern(new_pattern);
                            step = step + 1;
                        }
                        else if ( step == 1 )
                        {
                            //step 1: found John's credit card number
                            System.out.println(" " + result.getResultStr());

                            //note that, here, 'hlts' is written to, as 'e_highlight' has been set.
                            //output the highlight info of the credit card number
                            Highlights hlts = result.getHighlights();
                            hlts.begin(doc);
                            while ( hlts.hasNext() )
                            {
                                System.out.println("The current highlight is from page: " + hlts.getCurrentPageNumber());
                                hlts.next();
                            }

                            //see if there is an AMEX card number
                            String new_pattern = "\\d{4}-\\d{6}-\\d{5}";
                            txt_search.setPattern(new_pattern);
                            step = step + 1;
                        }
                        else if ( step == 2 )
                        {
                            //found an AMEX card number
                            System.out.println("\nThere is an AMEX card number: ");
                            System.out.println(" " + result.getResultStr());

                            //change mode to find the owner of the credit card; supposedly, the owner's
                            //name precedes the number
                            mode = txt_search.getMode();
                            mode |= TextSearch.e_search_up;
                            txt_search.setMode(mode);
                            //NOTE(review): the range A-z also spans the ASCII punctuation between
                            //'Z' and 'a' ([ \ ] ^ _ `); confirm whether [A-Za-z] was intended.
                            String new_pattern = "[A-z]++ [A-z]++";
                            txt_search.setPattern(new_pattern);
                            step = step + 1;
                        }
                        else if ( step == 3 )
                        {
                            //found the owner's name of the AMEX card
                            System.out.println("Is the owner's name:");
                            System.out.println(" " + result.getResultStr() + "?");

                            //add a link annotation based on the location of the found instance
                            Highlights hlts = result.getHighlights();
                            hlts.begin(doc);
                            while ( hlts.hasNext() )
                            {
                                Page cur_page = doc.getPage(hlts.getCurrentPageNumber());
                                //each quad occupies 8 consecutive doubles; this code reads the even
                                //offsets as x coordinates and the odd offsets as y coordinates
                                double[] q = hlts.getCurrentQuads();
                                int quad_count = q.length/8;
                                for ( int i = 0; i < quad_count; ++i )
                                {
                                    //assume each quad is an axis-aligned rectangle
                                    int offset = 8*i;
                                    double x1 = Math.min(Math.min(Math.min(q[offset+0], q[offset+2]), q[offset+4]), q[offset+6]);
                                    double x2 = Math.max(Math.max(Math.max(q[offset+0], q[offset+2]), q[offset+4]), q[offset+6]);
                                    double y1 = Math.min(Math.min(Math.min(q[offset+1], q[offset+3]), q[offset+5]), q[offset+7]);
                                    double y2 = Math.max(Math.max(Math.max(q[offset+1], q[offset+3]), q[offset+5]), q[offset+7]);
                                    pdftron.PDF.Annots.Link hyper_link = pdftron.PDF.Annots.Link.create(doc, new Rect(x1, y1, x2, y2), Action.createURI(doc, "http://www.pdftron.com"));
                                    cur_page.annotPushBack(hyper_link);
                                }
                                hlts.next();
                            }

                            String output_path = "../../TestFiles/Output/";
                            doc.save((output_path + "credit card numbers_linked.pdf"), SDFDoc.e_linearized, null);
                            break;
                        }
                    }
                    else if ( result.getCode() == TextSearchResult.e_page )
                    {
                        //you can update your UI here, if needed
                    }
                    else
                    {
                        //any other result code ends the search loop
                        break;
                    }
                }
            }
            finally
            {
                //release the document even if the search, annotation or save step failed
                doc.close();
            }
        }
        catch (PDFNetException e)
        {
            System.out.println(e);
        }
        finally
        {
            PDFNet.terminate();
        }
    }
}
Is there any way I can get the number alone from the string stored in a separate file?