I'm experimenting with Lucene while writing an application. As part of this test, I use Tika to extract text from documents. When I run it with Tika, I get the following error. Could you please explain it?
The errors are as follows.
Exception in thread "main" org.apache.tika.exception.TikaException: Unexpected RuntimeException from org.apache.tika.parser.microsoft.ooxml.OOXMLParser@7541f8
at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:122)
at org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:101)
at touchLucene.SuckClass.getDocument(SuckClass.java:79)
at touchLucene.SuckClass.indexFile(SuckClass.java:117)
at touchLucene.SuckClass.index(SuckClass.java:62)
at touchLucene.SuckClass.main(SuckClass.java:141)
Caused by: java.lang.NullPointerException
at org.apache.tika.parser.microsoft.ooxml.XWPFWordExtractorDecorator.extractHeaders(XWPFWordExtractorDecorator.java:113)
at org.apache.tika.parser.microsoft.ooxml.XWPFWordExtractorDecorator.buildXHTML(XWPFWordExtractorDecorator.java:55)
at org.apache.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor.getXHTML(AbstractOOXMLExtractor.java:69)
at org.apache.tika.parser.microsoft.ooxml.OOXMLParser.parse(OOXMLParser.java:51)
at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:120)
... 5 more
Here is my code.
package touchLucene;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
/**
 * Indexes every readable file under a data directory into a Lucene index,
 * using Tika's AutoDetectParser to extract both body text and metadata.
 *
 * Note on the reported stack trace: the NullPointerException inside
 * XWPFWordExtractorDecorator.extractHeaders is a known bug in old Tika
 * releases when a .docx file has no headers/footers; upgrading Tika is the
 * real cure for that crash. The fixes below address defects in THIS code.
 */
public class SuckClass {
    private boolean DEBUG = false;
    private IndexWriter writer;

    // Metadata keys whose values should also be searchable as body content.
    static Set<String> textualMetadataFields = new HashSet<String>();
    static {
        textualMetadataFields.add(Metadata.TITLE);
        textualMetadataFields.add(Metadata.AUTHOR);
        textualMetadataFields.add(Metadata.COMMENTS);
        textualMetadataFields.add(Metadata.KEYWORDS);
        textualMetadataFields.add(Metadata.DESCRIPTION);
        textualMetadataFields.add(Metadata.SUBJECT);
    }

    /**
     * Opens (and truncates) a Lucene index at the given directory.
     *
     * @param indexDir filesystem path where the index is stored
     * @throws IOException if the directory cannot be opened
     */
    public SuckClass(String indexDir) throws IOException {
        Directory dir = FSDirectory.open(new File(indexDir));
        writer = new IndexWriter(dir, new StandardAnalyzer(Version.LUCENE_30),
                true, IndexWriter.MaxFieldLength.UNLIMITED);
    }

    /**
     * Indexes every plain, visible, readable file directly inside dataDir.
     *
     * @param dataDir directory containing the documents to index
     * @return number of documents now in the index
     */
    public int index(String dataDir) throws IOException, SAXException, TikaException {
        File[] files = new File(dataDir).listFiles();
        // FIX: listFiles() returns null when dataDir does not exist or is not
        // a directory; the original code would have thrown an NPE here.
        if (files == null) {
            throw new IOException("Cannot list directory: " + dataDir);
        }
        for (File f : files) {
            if (!f.isDirectory() && !f.isHidden() && f.exists() && f.canRead()) {
                indexFile(f);
            }
        }
        return writer.numDocs();
    }

    /**
     * Extracts text and metadata from a file via Tika and builds a Lucene
     * Document: analyzed "contents" for search, stored (unanalyzed) metadata
     * fields, and the canonical path under "fieldname".
     */
    protected Document getDocument(File f) throws IOException, SAXException, TikaException {
        Metadata metadata = new Metadata();
        metadata.set(Metadata.RESOURCE_NAME_KEY, f.getName());
        Parser parser = new AutoDetectParser();
        ContentHandler handler = new BodyContentHandler();
        ParseContext context = new ParseContext();
        // Registering the parser lets Tika recurse into embedded documents.
        context.set(Parser.class, parser);
        InputStream is = new FileInputStream(f);
        try {
            // FIX: the original passed "new ParseContext()" here, discarding
            // the context that had the parser registered above.
            parser.parse(is, handler, metadata, context);
        } finally {
            is.close();
        }
        Document doc = new Document();
        doc.add(new Field("contents", handler.toString(), Field.Store.NO, Field.Index.ANALYZED));
        if (DEBUG) {
            System.out.println(" all text : " + handler.toString());
        }
        for (String name : metadata.names()) {
            String value = metadata.get(name);
            // FIX: Lucene's Field constructor throws NPE on a null value, so
            // skip metadata entries that have no value.
            if (value == null) {
                continue;
            }
            if (textualMetadataFields.contains(name)) {
                // Fold searchable metadata into the multi-valued "contents" field.
                doc.add(new Field("contents", value, Field.Store.NO, Field.Index.ANALYZED));
            }
            doc.add(new Field(name, value, Field.Store.YES, Field.Index.NO));
            if (DEBUG) {
                System.out.println("Hello I'm from second debug :" + name + " : " + value);
            }
        }
        if (DEBUG) {
            System.out.println("Hi I'm from third debug.");
        }
        doc.add(new Field("fieldname", f.getCanonicalPath(), Field.Store.YES, Field.Index.NOT_ANALYZED));
        return doc;
    }

    /** Extracts a single file and adds the resulting document to the index. */
    void indexFile(File f) throws IOException, SAXException, TikaException {
        System.out.println("Now indexing : " + f.getCanonicalPath());
        Document doc = getDocument(f);
        // FIX: the original built the document but never added it to the
        // writer, so nothing was ever indexed and numDocs() stayed at 0.
        writer.addDocument(doc);
    }

    /** Commits and closes the underlying IndexWriter. */
    public void close() throws CorruptIndexException, IOException {
        writer.close();
    }

    /**
     * Lists the available Tika parsers, then indexes E:/ToTest/doc into
     * E:/ToTest/index and reports the elapsed time.
     */
    public static void main(String[] args) throws IOException, SAXException, TikaException {
        String indexDir = "E:/ToTest/index";
        String dataDir = "E:/ToTest/doc";
        TikaConfig config = TikaConfig.getDefaultConfig();
        List<String> parsers = new ArrayList<String>(config.getParsers().keySet());
        Collections.sort(parsers);
        Iterator<String> it = parsers.iterator();
        while (it.hasNext()) {
            System.out.println("This is form hasNext method : " + it.next());
        }
        System.out.println("----------------XXX-----------------");
        long start = new Date().getTime();
        SuckClass suck = new SuckClass(indexDir);
        int numIndexed = suck.index(dataDir);
        suck.close();
        long end = new Date().getTime();
        System.out.println(numIndexed + " were indexed in " + (end - start) + " milliseconds.");
    }
}
Please help me.