Ekstraksi text dari HTML Menggunakan HTMLEditorKit dan JEditorPane

Using javax.swing.text.html.HTMLEditorKit

Kalau sebelumnya ekstraksi dilakukan dengan regexp, maka sekarang kita menggunakan HTMLEditorKit, sebuah library bawaan Java.
Pada banyak kasus, HTMLEditorKit digunakan bersama JEditorPane, sebuah komponen teks, tetapi bisa juga digunakan langsung untuk mengekstraksi teks dari halaman HTML. Berikut source code yang saya dapatkan dari sebuah link:

import java.io.IOException;
import java.io.FileReader;
import java.io.Reader;
import java.util.List;
import java.util.ArrayList;

import javax.swing.text.html.parser.ParserDelegator;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.MutableAttributeSet;

/**
 * Static helpers for extracting the text content of an HTML document with the
 * Swing HTML parser ({@link ParserDelegator}).
 *
 * <p>The class is not meant to be instantiated; all functionality is static.
 */
public class HTMLUtils {
  private HTMLUtils() {}

  /**
   * Parses the HTML supplied by {@code reader} and collects every run of text
   * the parser reports, in document order.
   *
   * <p>The reader is consumed but not closed; closing it is the caller's job.
   *
   * @param reader source of the HTML markup
   * @return one list element per text callback issued by the parser
   * @throws IOException if reading from {@code reader} fails
   */
  public static List<String> extractText(Reader reader) throws IOException {
    final List<String> list = new ArrayList<String>();

    // Only text is of interest. ParserCallback's other handlers (tags,
    // comments, errors) are no-ops by default, so they are not overridden.
    ParserCallback parserCallback = new ParserCallback() {
      @Override
      public void handleText(final char[] data, final int pos) {
        list.add(new String(data));
      }
    };

    // ignoreCharSet = true, so a charset declared in the document's markup is
    // ignored; the Reader has already turned bytes into characters.
    new ParserDelegator().parse(reader, parserCallback, true);
    return list;
  }

  /**
   * Demo entry point: prints every text fragment found in {@code java-new.html}
   * (resolved against the working directory), one fragment per line.
   *
   * @param args ignored
   * @throws Exception if the file cannot be opened or read
   */
  public final static void main(String[] args) throws Exception {
    FileReader reader = new FileReader("java-new.html");
    try {
      List<String> lines = HTMLUtils.extractText(reader);
      for (String line : lines) {
        System.out.println(line);
      }
    } finally {
      // Release the file handle even when parsing fails.
      reader.close();
    }
  }
}

Lalu, saya mengimplementasikannya ke dalam program saya. Ini dia:

package ekstraksi;

/**
 *
 * @author aharjunadhi
 */
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.List;
import java.util.ArrayList;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.swing.text.html.parser.ParserDelegator;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.MutableAttributeSet;

/**
 * Extracts the text of an HTML file and post-processes it into a corpus with
 * one sentence per line.
 *
 * <p>The pipeline run by {@link #main(String[])} is:
 * <ol>
 *   <li>{@code agastya.html} -> {@code hasil.txt}: text fragments, one per line;</li>
 *   <li>{@link #ClearCss()}: {@code hasil.txt} -> {@code corpus.txt}, dropping
 *       every line that contains <code>{</code>;</li>
 *   <li>{@link #SatuBarisSatuKalimat()}: {@code corpus.txt} -> {@code corpus2.txt},
 *       split on {@code '.'}.</li>
 * </ol>
 * All files live in {@link #SOURCE_DIR}.
 */
public class HTMLUtils {
  /** Directory holding the input HTML and every intermediate/output text file. */
  private static final String SOURCE_DIR =
      "/home/aharjunadhi/NetBeansProjects/Ekstraksi/sourcefile/";

  private HTMLUtils() {}

  /**
   * Parses the HTML supplied by {@code reader} and collects every run of text
   * the parser reports, in document order.
   *
   * <p>The reader is consumed but not closed; closing it is the caller's job.
   *
   * @param reader source of the HTML markup
   * @return one list element per text callback issued by the parser
   * @throws IOException if reading from {@code reader} fails
   */
  public static List<String> extractText(Reader reader) throws IOException {
    final List<String> list = new ArrayList<String>();

    // Only text is of interest. ParserCallback's other handlers (tags,
    // comments, errors) are no-ops by default, so they are not overridden.
    ParserCallback parserCallback = new ParserCallback() {
      @Override
      public void handleText(final char[] data, final int pos) {
        list.add(new String(data));
      }
    };

    // ignoreCharSet = true, so a charset declared in the document's markup is
    // ignored; the Reader has already turned bytes into characters.
    new ParserDelegator().parse(reader, parserCallback, true);
    return list;
  }

  /**
   * Rewrites {@code corpus.txt} as {@code corpus2.txt} with one sentence per line.
   *
   * <p>All lines of {@code corpus.txt} are joined without a separator, the
   * result is split on {@code '.'}, and each piece is written followed by a
   * newline. The period itself is consumed by the split and is not written.
   *
   * @throws IOException if {@code corpus.txt} cannot be read or
   *     {@code corpus2.txt} cannot be written
   */
  public void SatuBarisSatuKalimat() throws IOException {
    StringBuilder corpus = new StringBuilder();
    BufferedReader in = new BufferedReader(new FileReader(SOURCE_DIR + "corpus.txt"));
    try {
      String row;
      while ((row = in.readLine()) != null) {
        corpus.append(row);
      }
    } finally {
      in.close();
    }

    StringBuilder sentences = new StringBuilder();
    for (String sentence : corpus.toString().split("\\.")) {
      sentences.append(sentence).append("\n");
    }

    writeFile(SOURCE_DIR + "corpus2.txt", sentences.toString());
  }

  /**
   * Filters {@code hasil.txt} through {@code sed -e '/{/d'}, which deletes every
   * line containing <code>{</code>, and stores the surviving lines, joined
   * without a separator, in {@code corpus.txt}.
   *
   * <p>An {@link IOException} is logged at {@code SEVERE} level rather than
   * propagated; whatever was read up to that point is still returned.
   *
   * @return the filtered text, or the portion read before a failure
   */
  public String ClearCss() {
    StringBuilder filtered = new StringBuilder();
    String command = "sed -e '/{/d' " + SOURCE_DIR + "hasil.txt";
    Process sed = null;
    try {
      // Run through a shell so the single-quoted sed script is parsed as written.
      sed = Runtime.getRuntime().exec(new String[] {"sh", "-c", command});
      BufferedReader reader = new BufferedReader(new InputStreamReader(sed.getInputStream()));
      try {
        String row;
        while ((row = reader.readLine()) != null) {
          filtered.append(row);

          // Progress trace, printed once per surviving line.
          System.out.println("apa ajaaa...");
        }
      } finally {
        reader.close();
      }

      System.out.println("apa deh");
      writeFile(SOURCE_DIR + "corpus.txt", filtered.toString());
    } catch (IOException ex) {
      Logger.getLogger(HTMLUtils.class.getName()).log(Level.SEVERE, null, ex);
    } finally {
      // Its output has been fully read (or reading failed); release the child process.
      if (sed != null) {
        sed.destroy();
      }
    }
    return filtered.toString();
  }

  /**
   * Runs the whole pipeline: extracts the text of {@code agastya.html}, echoes
   * each fragment to standard output, saves the fragments to {@code hasil.txt}
   * (one per line), then calls {@link #ClearCss()} and
   * {@link #SatuBarisSatuKalimat()}.
   *
   * @param args ignored
   * @throws Exception if any file in the pipeline cannot be read or written
   */
  public final static void main(String[] args) throws Exception {
    StringBuilder text = new StringBuilder();
    FileReader reader = new FileReader(SOURCE_DIR + "agastya.html");
    try {
      List<String> lines = HTMLUtils.extractText(reader);
      for (String line : lines) {
        System.out.println(line);
        text.append(line).append("\n");
      }
    } finally {
      // Release the file handle even when parsing fails.
      reader.close();
    }
    writeFile(SOURCE_DIR + "hasil.txt", text.toString());

    HTMLUtils utils = new HTMLUtils();
    utils.ClearCss();
    utils.SatuBarisSatuKalimat();
  }

  /** Writes {@code content} to {@code path}, replacing the file, and always closes the writer. */
  private static void writeFile(String path, String content) throws IOException {
    BufferedWriter writer = new BufferedWriter(new FileWriter(path));
    try {
      writer.write(content);
    } finally {
      writer.close();
    }
  }
}


semoga bermanfaat..🙂

4 thoughts on “Ekstraksi text dari HTML Menggunakan HTMLEditorKit dan JEditorPane”

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s