Hello friends, I'm trying to remove all tags from a Wikipedia entry, leaving a simple text file. I have downloaded an HTML file from Wikipedia and hunted through my program.
But the tags are not removed properly. Instead, only nonsense comes out of it. Where is my thinker?
Java:
import java.util.Scanner;
import java.io.File;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.PrintWriter;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Files;
import java.nio.file.Paths;
public class Reg {
public static void main(String[] args) throws Exception {
File file = new File("test.html");
Path path = Paths.get(args[0]);
byte[] raw = Files.readAllBytes(path);
String text = new String(raw, "UTF8");
text = text.replaceAll("<script.*>.*</script>", "");
text = text.replaceAll("<.*>", "");
text = text.replaceAll("</.*>", "");
PrintWriter output = new PrintWriter("test.txt");
output.print(text);
}
}