진화를 꿈꾸다..

test

| JAVA/JSP/Source 2009. 1. 8. 17:56

Posted by S & S

package filter;

import java.io.*;
import java.net.*;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.Charset;

/**
 * Downloads the page behind a URL into a local scratch file ({@code tmp.html} in the
 * working directory) and exposes the saved page as text.
 *
 * <p>The character set is taken from the first supported {@code charset=...} or
 * {@code encoding=...} declaration found in the page; UTF-8 is used when the page
 * declares none. I/O failures are reported on the console instead of being thrown,
 * so no public method declares a checked exception.
 */
public class HtmlReader {
    /** Charset used when the page does not declare a supported one. */
    private static final String DEFAULT_CHARSET = "UTF-8";

    /**
     * Charset used only while scanning for the declaration. It maps every byte to one
     * character, so the ASCII declaration is readable whatever the real encoding is.
     */
    private static final String SNIFF_CHARSET = "ISO-8859-1";

    /** Attribute/parameter names that introduce a charset name (HTML meta, XML prolog). */
    private static final String[] DECLARATION_KEYS = {"charset", "encoding"};

    private static final int BUFFER_SIZE = 8192;

    private URL myURL = null;
    private String usedCharset = DEFAULT_CHARSET;
    private final String fileName = "tmp.html";

    /**
     * Detects the page's charset and downloads the page into the scratch file.
     *
     * <p>A malformed URL is reported on the console and leaves the reader without a
     * page; nothing is downloaded in that case.
     *
     * @param strURL address of the page to read
     */
    public HtmlReader(String strURL) {
        try {
            myURL = new URL(strURL);
        } catch (MalformedURLException e) {
            e.printStackTrace();
            return;
        }
        String declared = this.getCharSet(strURL);
        // Keep the UTF-8 default unless the page declares something the JVM can decode.
        if (declared != null && isSupportedCharset(declared)) {
            usedCharset = declared;
        }
        this.readPage();
    }

    /**
     * Copies the page behind the reader's URL into the scratch file, byte for byte.
     *
     * <p>Does nothing when the reader has no valid URL. I/O failures are printed as a
     * stack trace.
     */
    public void readPage() {
        if (myURL == null) {
            return;
        }
        try (InputStream source = myURL.openStream();
             OutputStream target = new FileOutputStream(fileName)) {
            // A raw copy keeps line terminators and avoids a decode/encode round trip.
            byte[] buffer = new byte[BUFFER_SIZE];
            int count;
            while ((count = source.read(buffer)) != -1) {
                target.write(buffer, 0, count);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Finds the charset declared by the page at the given URL.
     *
     * <p>The page is scanned line by line and the first {@code charset=} or
     * {@code encoding=} value that names a charset supported by this JVM is returned,
     * spelled as it appears in the page. The reader's own URL is not changed.
     *
     * @param strURL address of the page to inspect
     * @return the declared charset name, or {@code null} when none is found or the page
     *         cannot be read (the failure is printed as a stack trace)
     */
    public String getCharSet(String strURL) {
        try {
            URL page = new URL(strURL);
            try (InputStream raw = page.openStream();
                 BufferedReader reader =
                         new BufferedReader(new InputStreamReader(raw, SNIFF_CHARSET))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    String declared = findDeclaredCharset(line);
                    if (declared != null) {
                        return declared;
                    }
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Runs the saved page through {@link #tagFilter(StringBuffer)} and writes the result
     * to a file, encoded with the page's charset.
     *
     * <p>I/O failures are reported on standard output.
     *
     * @param ofn path of the file to write
     */
    public void parserText(String ofn) {
        try {
            String text = tagFilter(loadSavedPage()).toString();
            try (OutputStream raw = new FileOutputStream(ofn);
                 Writer writer = new OutputStreamWriter(raw, usedCharset)) {
                writer.write(text);
            }
        } catch (IOException ex) {
            System.out.println("입출력 예외: " + ex.getMessage());
        }
    }

    /**
     * Runs the saved page through {@link #tagFilter(StringBuffer)} and returns the result.
     *
     * @return the filtered page text, or an empty string when the scratch file cannot be
     *         read (the failure is reported on standard output)
     */
    public String parserStr() {
        try {
            return tagFilter(loadSavedPage()).toString();
        } catch (IOException ex) {
            System.out.println("입출력 예외: " + ex.getMessage());
            return "";
        }
    }

    /**
     * Hook meant to strip tags from HTML text.
     *
     * <p>NOTE(review): tag removal is not implemented yet; the buffer is returned
     * unchanged.
     *
     * @param strBuf HTML text to filter
     * @return the same buffer instance that was passed in
     */
    public StringBuffer tagFilter(StringBuffer strBuf) {
        return strBuf;
    }

    /**
     * Meant to extract link addresses from a fetched page.
     *
     * <p>NOTE(review): not implemented yet; always returns {@code null}.
     *
     * @param fileName currently unused
     * @return always {@code null}
     */
    public String getLink(String fileName) {
        String result = null;
        return result;
    }

    /** Reads the whole scratch file, decoded with the page's charset. */
    private StringBuffer loadSavedPage() throws IOException {
        StringBuffer text = new StringBuffer();
        try (InputStream raw = new FileInputStream(fileName);
             Reader reader = new InputStreamReader(raw, usedCharset)) {
            char[] chunk = new char[BUFFER_SIZE];
            int count;
            while ((count = reader.read(chunk)) != -1) {
                text.append(chunk, 0, count);
            }
        }
        return text;
    }

    /** Returns the first supported charset name declared in the line, or null. */
    private static String findDeclaredCharset(String line) {
        for (String key : DECLARATION_KEYS) {
            int lastStart = line.length() - key.length();
            for (int start = 0; start <= lastStart; start++) {
                // regionMatches ignores case without depending on the default locale.
                if (line.regionMatches(true, start, key, 0, key.length())) {
                    String name = readDeclaredName(line, start + key.length());
                    if (name != null) {
                        return name;
                    }
                }
            }
        }
        return null;
    }

    /** Parses {@code = "name"} starting at pos; returns the name only if it is supported. */
    private static String readDeclaredName(String line, int pos) {
        int length = line.length();
        pos = skipSpaces(line, pos);
        if (pos >= length || line.charAt(pos) != '=') {
            return null;
        }
        pos = skipSpaces(line, pos + 1);
        if (pos < length && (line.charAt(pos) == '"' || line.charAt(pos) == '\'')) {
            pos++;
        }
        int end = pos;
        while (end < length && isCharsetNameChar(line.charAt(end))) {
            end++;
        }
        if (end == pos) {
            return null;
        }
        String name = line.substring(pos, end);
        return isSupportedCharset(name) ? name : null;
    }

    /** Returns the index of the first non-space, non-tab character at or after pos. */
    private static int skipSpaces(String line, int pos) {
        while (pos < line.length() && (line.charAt(pos) == ' ' || line.charAt(pos) == '\t')) {
            pos++;
        }
        return pos;
    }

    /** Tells whether c may appear in a charset name. */
    private static boolean isCharsetNameChar(char c) {
        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')
                || c == '-' || c == '_' || c == '.' || c == ':' || c == '+';
    }

    /** Tells whether this JVM can decode the named charset; false for illegal names. */
    private static boolean isSupportedCharset(String name) {
        try {
            return Charset.isSupported(name);
        } catch (IllegalArgumentException ignored) {
            // Null or syntactically illegal name: treat it as "not a charset".
            return false;
        }
    }

    /** Demo entry point: fetches a page and writes its text to {@code result.txt}. */
    public static void main(String[] args) {
        HtmlReader page = new HtmlReader("http://www.naver.com");
        page.parserText("result.txt");
    }
}

저작자표시 비영리 동일조건

'JAVA/JSP > Source' 카테고리의 다른 글

html tag filter(2) (0)	2009.01.12
html tag filter (0)	2009.01.09
xmlParser (0)	2009.01.07
해당 url html 소스 가져오기 (0)	2009.01.07
게시판, 자료실, XML과 JSP, JDOM, JSTL, Custom TAG (0)	2009.01.07

And

일	월	화	수	목	금	토
						1
2	3	4	5	6	7	8
9	10	11	12	13	14	15
16	17	18	19	20	21	22
23	24	25	26	27	28

진화를 꿈꾸다..

Article Category

Notice

Recent Article

Recent Comment

Recent Trackback

test

'JAVA/JSP > Source' 카테고리의 다른 글

Tag Cloud

Calendar

Archive

My Link

티스토리툴바