Skip to content Skip to sidebar Skip to footer

Jsoup : How To Parse Multiple HTML Files From Local Drive?

I've got multiple HTML files on my hdd to parse with Jsoup. I've been able to parse one file but not multiple files. I would like to parse all the files of a folder. I wrote this c

Solution 1:

Extract the HTML-parsing code into its own method, then list the contents of your directory and call that method for each file:

   File input = new File("C:/html");
   // listFiles() returns null (not an empty array) when the path is not a
   // directory or cannot be read, so guard before iterating.
   File[] st = input.listFiles();
   if (st != null) {
       for (File candidate : st) {
           if (candidate.isFile()) { // other condition like name ends in html
               parse(candidate);
           }
       }
   }

So your code should look like this:

import java.io.File;
import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Parses every regular file directly inside {@code C:/html} with Jsoup and
 * prints the text of each {@code p} element found under a {@code div} whose
 * id starts with {@code desk}.
 */
public class Main {

    /**
     * Lists {@code C:/html} and hands each regular file to {@link #parse(File)}.
     * Sub-directories are skipped.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        File input = new File("C:/html");
        File[] st = input.listFiles();
        // listFiles() returns null when the path is not a directory or an I/O
        // error occurs; report it instead of failing with a NullPointerException.
        if (st == null) {
            System.err.println("Cannot list directory: " + input);
            return;
        }
        for (File candidate : st) {
            if (candidate.isFile()) { // other condition like name ends in html
                parse(candidate);
            }
        }
    }

    /**
     * Parses one file as UTF-8 HTML and prints the text of every element
     * matching {@code div[id^=desk] p}, each preceded by a blank line.
     * A file that cannot be read is reported on stderr and skipped so that the
     * remaining files are still processed.
     *
     * @param input the HTML file to parse
     */
    private static void parse(File input) {
        try {
            Document doc = Jsoup.parse(input, "UTF-8", "");
            Elements ids = doc.select("div[id^=desk] p");
            for (Element id : ids) {
                System.out.println("\n" + id.text());
            }
        } catch (IOException e) {
            // Previously swallowed silently; surface which file failed and why.
            System.err.println("Failed to parse " + input + ": " + e);
        }
    }
}

Solution 2:

I have written a program that reads a folder and its subfolders from a given path and writes the results to a CSV file:

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

/**
 * Walks a directory tree, prints an indented listing of it, and for every
 * regular file found directly inside a directory parses it with Jsoup and
 * appends its title and body text to a CSV file.
 */
public class fixingCode {

    /**
     * Opens the results CSV, processes the hard-coded root folder and closes
     * the CSV again.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        System.out.println("--------------------------Program started--------------------------");

        File input = new File(
                "C:\\My Web Sites\\\\library\\math");//reading file from parent folder

        // try-with-resources closes (and thereby flushes) the writer on every
        // path, and avoids touching a null writer when the file cannot be opened.
        try (FileWriter writer = new FileWriter("c:\\Temp\\results.csv")) {//writing file on path
            Process(input, writer);
        } catch (IOException e) {
            e.printStackTrace();
        }

        System.out.println("--------------------------Program End--------------------------");
    }

    /** Current recursion depth of {@link #Process}; used only to indent console output. */
    static int spc_count = -1;

    /**
     * Prints {@code aFile} indented by its depth. If it is a directory, every
     * regular file directly inside it is parsed and written to {@code writer},
     * and then every entry of the directory is processed recursively.
     * Exceptions are printed and never propagated to the caller.
     *
     * @param aFile  file or directory to process
     * @param writer destination for the CSV rows
     */
    static void Process(File aFile, FileWriter writer) {
        spc_count++;
        try {
            String spcs = indent(spc_count);

            if (aFile.isFile()) {
                System.out.println(spcs + "[FILE] " + aFile.getName());
            } else if (aFile.isDirectory()) {
                System.out.println(spcs + "[DIR] " + aFile.getName());

                File[] listOfFiles = aFile.listFiles();
                // listFiles() yields null when the directory cannot be read;
                // check before any use of the array.
                if (listOfFiles == null) {
                    System.out.println(spcs + " [ACCESS DENIED]");
                    return;
                }

                for (int i = 0; i < listOfFiles.length; i++) {
                    if (listOfFiles[i].isFile()) {
                        try {
                            writeDocument(listOfFiles[i], i, writer);
                        } catch (IOException e) {
                            // One unreadable file must not abort the rest of the directory.
                            e.printStackTrace();
                        }
                    }
                }

                // Recurse exactly once per entry (this loop used to be nested in
                // the loop above, re-processing every child once per entry).
                for (File child : listOfFiles) {
                    Process(child, writer);
                }
            }
        } catch (Exception e) {
            // Boundary catch: keep the walk from propagating unexpected failures.
            e.printStackTrace();
        } finally {
            spc_count--;
        }
    }

    /** Parses one HTML file, echoes its title and body text, and appends both as CSV rows. */
    private static void writeDocument(File file, int index, FileWriter writer) throws IOException {
        // A null charset is passed through to Jsoup unchanged from the original code.
        Document doc = Jsoup.parse(file, null);

        String title = doc.title();
        System.out.println("title : " + "[" + index + "]" + title);

        String text = doc.body().text();
        System.out.println("text" + text);

        writer.append("title : " + "[" + index + "]");
        writer.append(',');
        writer.append(csvField(title));
        writer.append('\n');

        writer.append("text : " + "[" + index + "]");
        writer.append(',');
        writer.append(csvField(text));
        writer.append('\n');
    }

    /** Returns a string of {@code depth} spaces (empty for non-positive depth). */
    private static String indent(int depth) {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < depth; i++) {
            sb.append(' ');
        }
        return sb.toString();
    }

    /**
     * Quotes a CSV field when it contains a comma, double quote or line break,
     * doubling embedded quotes; other values are returned unchanged.
     */
    private static String csvField(String value) {
        boolean needsQuoting = value.indexOf(',') >= 0
                || value.indexOf('"') >= 0
                || value.indexOf('\n') >= 0
                || value.indexOf('\r') >= 0;
        if (!needsQuoting) {
            return value;
        }
        return "\"" + value.replace("\"", "\"\"") + "\"";
    }

}

Post a Comment for "Jsoup : How To Parse Multiple HTML Files From Local Drive?"