Jsoup : How To Parse Multiple HTML Files From Local Drive?
I have multiple HTML files on my hard drive that I want to parse with Jsoup. I've been able to parse a single file, but not multiple files. I would like to parse all the files in a folder. I wrote this code…
Solution 1:
Extract the HTML-parsing code into its own method, then list the contents of your directory and call that method for each file:
File input = new File("C:/html");
File[] st = input.listFiles();
// listFiles() returns null when the path is not a directory or cannot be read,
// so guard before iterating to avoid a NullPointerException.
if (st != null) {
    for (File candidate : st) {
        if (candidate.isFile()) { // other condition like name ends in html
            parse(candidate);
        }
    }
}
So your code should look like this:
import java.io.File;
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Parses every regular file found directly inside a fixed directory with Jsoup
 * and prints the text of the selected paragraphs.
 */
public class Main {

    /**
     * Lists the entries of {@code C:/html} and passes each regular file to
     * {@link #parse(File)}. Sub-directories are not descended into.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        File input = new File("C:/html");
        File[] entries = input.listFiles();
        if (entries == null) {
            // listFiles() returns null when the path is not a directory or an
            // I/O error occurs; iterating it would throw a NullPointerException.
            System.err.println("Cannot list directory: " + input.getAbsolutePath());
            return;
        }
        for (File entry : entries) {
            if (entry.isFile()) { // other condition like name ends in html
                parse(entry);
            }
        }
    }

    /**
     * Parses one file as UTF-8 HTML and prints, each preceded by a blank line,
     * the text of every element matched by the selector {@code div[id^=desk] p}.
     *
     * <p>A file that cannot be read is reported on standard error and skipped,
     * so one bad file does not stop the remaining files from being processed.
     *
     * @param input the HTML file to parse
     */
    private static void parse(File input) {
        try {
            Document doc = Jsoup.parse(input, "UTF-8", "");
            Elements paragraphs = doc.select("div[id^=desk] p");
            for (Element paragraph : paragraphs) {
                System.out.println("\n" + paragraph.text());
            }
        } catch (IOException e) {
            // Previously swallowed silently; surface which file failed and why.
            System.err.println("Failed to parse " + input.getAbsolutePath() + ": " + e.getMessage());
        }
    }
}
Solution 2:
I have written a program that reads a folder and its subfolders under a given path and writes the results to a CSV file:
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
/**
 * Walks a directory tree, parses every regular file it finds with Jsoup, and
 * records each document's title and body text both on standard output and in
 * a CSV file.
 */
public class fixingCode {

    /**
     * Opens the result CSV, processes the root folder, and always flushes and
     * closes the writer afterwards.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        FileWriter writer = null;
        System.out.println("--------------------------Program started--------------------------");
        File input = new File(
                "C:\\My Web Sites\\\\library\\math");//reading file from parent folder
        try {
            writer = new FileWriter("c:\\Temp\\results.csv");//writing file on path
            Process(input, writer);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // writer is still null when the FileWriter constructor itself threw;
            // dereferencing it here would mask the real error with an NPE.
            if (writer != null) {
                try {
                    writer.flush();
                    writer.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        System.out.println("--------------------------Program End--------------------------");
    }

    /** Current nesting depth of {@link #Process} calls; -1 while no call is active. */
    static int spc_count = -1;

    /**
     * Prints {@code aFile} indented by the current depth. If it is a directory,
     * parses each regular file directly inside it, appends the title and body
     * text to {@code writer}, and then recurses into every child exactly once.
     *
     * <p>Failures are printed and do not propagate, so a problem in one
     * directory does not abort the rest of the walk.
     *
     * @param aFile  file or directory to process
     * @param writer destination for the CSV rows
     */
    static void Process(File aFile, FileWriter writer) {
        spc_count++;
        try {
            StringBuilder indent = new StringBuilder();
            for (int i = 0; i < spc_count; i++) {
                indent.append(" ");
            }
            String spcs = indent.toString();

            if (aFile.isFile()) {
                System.out.println(spcs + "[FILE] " + aFile.getName());
            } else if (aFile.isDirectory()) {
                System.out.println(spcs + "[DIR] " + aFile.getName());

                File[] listOfFiles = aFile.listFiles();
                // Check for null BEFORE touching the array: listFiles() returns
                // null when the directory cannot be read.
                if (listOfFiles == null) {
                    System.out.println(spcs + " [ACCESS DENIED]");
                    return;
                }

                // First pass: parse the regular files of this directory.
                for (int i = 0; i < listOfFiles.length; i++) {
                    if (listOfFiles[i].isFile()) {// other condition like name ends in html
                        writeDocument(listOfFiles[i], i, writer);
                    }
                }

                // Second pass: descend into each child once. This used to sit
                // inside the loop above, which repeated the whole descent once
                // per directory entry and duplicated the output.
                for (File child : listOfFiles) {
                    Process(child, writer);
                }
            }
        } catch (Exception e) {
            // Deliberately broad: keep the walk best-effort, as before.
            e.printStackTrace();
        } finally {
            spc_count--;
        }
    }

    /**
     * Parses one file and emits its title and body text to standard output and
     * as two CSV rows. A file that cannot be read is reported and skipped.
     *
     * @param file   the file to parse
     * @param index  position of the file within its directory listing, used in the row labels
     * @param writer destination for the CSV rows
     * @throws IOException if writing to {@code writer} fails
     */
    private static void writeDocument(File file, int index, FileWriter writer) throws IOException {
        Document doc;
        try {
            // null charset: let Jsoup determine the encoding itself
            doc = Jsoup.parse(file, null);
        } catch (IOException e) {
            // One unreadable file must not stop the remaining files.
            e.printStackTrace();
            return;
        }

        String title = doc.title();
        String text = doc.body().text();

        System.out.println("title : " + "[" + index + "]" + title);
        System.out.println("text" + text);

        writer.append("title : " + "[" + index + "]");
        writer.append(',');
        writer.append(csvField(title));
        writer.append('\n');

        writer.append("text : " + "[" + index + "]");
        writer.append(',');
        writer.append(csvField(text));
        writer.append('\n');
    }

    /**
     * Quotes a value for CSV output when it contains a comma, a double quote,
     * or a line break; embedded double quotes are doubled. Other values are
     * returned unchanged.
     *
     * @param value raw field value
     * @return the value, quoted if necessary
     */
    private static String csvField(String value) {
        boolean needsQuoting = value.indexOf(',') >= 0
                || value.indexOf('"') >= 0
                || value.indexOf('\n') >= 0
                || value.indexOf('\r') >= 0;
        if (!needsQuoting) {
            return value;
        }
        return "\"" + value.replace("\"", "\"\"") + "\"";
    }
}
Post a Comment for "Jsoup : How To Parse Multiple HTML Files From Local Drive?"