I am trying to extract text from a PDF file using the PDFNet SDK by PDFTron. However, when execution reaches dumpAllText(), no element has the type e_text. When I debug the code, I can see that the element type is e_path, so I think the e_path elements contain all the text. How can I extract the text from the e_path elements? Please help.
import sys

if sys.version_info.major < 3:
    from PDFNetPython2 import *
else:
    from PDFNetPython3 import *
def printStyle(style):
    """Print the font attributes of *style* as a ``style="..."`` string.

    Reads the font name, font size, serif flag and color from ``style``
    through its ``GetFontName``, ``GetFontSize``, ``IsSerif`` and
    ``GetColor`` accessors and writes one line to stdout.

    NOTE(review): the value printed under the "sans-serif" label comes
    from ``IsSerif()``, so the label looks inverted -- confirm the
    intended output before relying on it.
    """
    font_name = style.GetFontName()
    font_size = str(style.GetFontSize())
    serif_flag = str(style.IsSerif())
    color = str(style.GetColor())
    # The font name is joined without str() so a non-string name still
    # raises TypeError, as plain concatenation would.
    pieces = [
        " style=\"font-family:", font_name,
        "; font-size:", font_size,
        "; sans-serif: ", serif_flag,
        "; color:", color,
        "\"",
    ]
    print("".join(pieces))
def _print_bbox(element):
    """Print the bounding box of *element* as ``BBox: x1, y1, x2, y2``."""
    bbox = element.GetBBox()
    print("BBox: " + str(bbox.GetX1()) + ", " + str(bbox.GetY1()) + ", "
          + str(bbox.GetX2()) + ", " + str(bbox.GetY2()))


def dumpAllText(reader):
    """Walk every element of *reader* and print what is found.

    For each element returned by ``reader.Next()``:

    * text begin/end, new-line and group begin/end markers print a label;
    * ``e_text`` elements print their bounding box and text string;
    * ``e_form`` elements are entered with ``reader.FormBegin()``, dumped
      recursively, and left with ``reader.End()``;
    * ``e_path`` elements print their bounding box only.

    Args:
        reader: an element reader that has already been started on a page
            (or form); it must provide ``Next``, ``FormBegin`` and ``End``.
    """
    element = reader.Next()
    while element is not None:
        elem_type = element.GetType()
        if elem_type == Element.e_text_begin:
            print("Text Block Begin")
        elif elem_type == Element.e_text_end:
            print("Text Block End")
        elif elem_type == Element.e_text:
            _print_bbox(element)
            print(element.GetTextString())
        elif elem_type == Element.e_text_new_line:
            print("New Line")
        elif elem_type == Element.e_form:
            # Descend into the form's own content stream, then pop back out.
            reader.FormBegin()
            dumpAllText(reader)
            reader.End()
        elif elem_type == Element.e_group_begin:
            print("Group begins")
        elif elem_type == Element.e_group_end:
            print("Group ends")
        elif elem_type == Element.e_path:
            # Per the PDFNet Element reference, GetTextData()/GetTextString()
            # apply only to e_text elements, so a path has no text to read;
            # only its bounding box is reported here.
            # NOTE(review): if the page draws its glyphs as outlines, the
            # characters presumably have to be recovered another way
            # (e.g. OCR) -- confirm against the PDFNet documentation.
            _print_bbox(element)
        element = reader.Next()
def main():
    """Open ``test.pdf`` and dump the elements of every page to stdout.

    Initializes PDFNet, opens the document, and for each page starts an
    ``ElementReader`` on it and passes the reader to ``dumpAllText``. The
    document is closed even if reading a page raises.
    """
    PDFNet.Initialize()

    # Path of the PDF file to read, relative to the working directory.
    input_path = "test.pdf"

    example5_low_level = True
    if not example5_low_level:
        return

    doc = PDFDoc(input_path)
    try:
        doc.InitSecurityHandler()

        # Example 1. Extract all text content from the document
        reader = ElementReader()
        itr = doc.GetPageIterator()
        while itr.HasNext():
            reader.Begin(itr.Current())
            dumpAllText(reader)
            reader.End()
            itr.Next()
    finally:
        # Release the document even when extraction fails part-way.
        doc.Close()
# Run the extraction only when this file is executed as a script.
if __name__ == '__main__':
    main()