'html' 태그의 글 목록 (2 Page)

'html'에 해당되는 글 4건

2009.01.12 html tag filter(2)

html tag filter(2)

| JAVA/JSP/Source 2009. 1. 12. 17:06

Posted by S & S

package filter;

import java.io.*;
import java.net.*;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.Charset;
import java.util.StringTokenizer;
import java.util.Vector;

/**
 * Downloads an HTML page, stores a copy in a temporary file and extracts the
 * text found between its tags.
 *
 * <p>The constructor fetches the page twice: once to sniff the declared
 * charset and once to save the page to {@code tmp.html} in the working
 * directory. The {@code parser*} methods then work on that saved copy.
 *
 * <p>Instances are not designed for concurrent use; every instance writes to
 * the same temporary file name.
 */
public class HtmlReader {
    /** Charset used when the page does not declare a recognisable one. */
    private static final String DEFAULT_CHARSET = "UTF-8";

    /**
     * Upper-case charset name prefixes searched for in the page. A match is
     * extended to the full name found in the page (for example
     * {@code ISO-8859} becomes {@code ISO-8859-1}).
     */
    private static final String[] CHARSET_PREFIXES = {
        "UTF-8", "EUC-JP", "EUC-KR", "ISO-8859", "US-ASCII", "UTF-16"
    };

    private URL myURL = null;
    private String usedCharset = DEFAULT_CHARSET;
    private final String fileName = "tmp.html";

    /**
     * Creates a reader for the given address and immediately downloads the
     * page into the temporary file.
     *
     * <p>A malformed URL is reported on standard error and leaves the reader
     * without a page; no exception is thrown.
     *
     * @param strURL address of the page to download
     */
    public HtmlReader(String strURL){
        try {
            myURL = new URL(strURL);
        } catch (MalformedURLException e) {
            e.printStackTrace();
            return;
        }
        // Keep the UTF-8 default when the page declares no usable charset;
        // a null charset name would make every later reader fail.
        String detected = this.getCharSet(strURL);
        if (detected != null) {
            usedCharset = detected;
        }
        this.readPage();
    }

    /**
     * Downloads the page and writes it to the temporary file using the
     * detected charset.
     *
     * <p>Lines are written back to back: line terminators of the original
     * page are not preserved. I/O failures are reported on standard error.
     */
    public void readPage(){
        if (myURL == null) {
            return; // nothing to download (URL was malformed)
        }
        InputStream in = null;
        BufferedReader reader = null;
        OutputStream out = null;
        Writer writer = null;
        try {
            in = myURL.openStream();
            reader = new BufferedReader(new InputStreamReader(in, usedCharset));
            // Open the output only after the page is reachable so a failed
            // download does not truncate an existing temporary file.
            out = new FileOutputStream(fileName);
            // A single writer emits at most one byte-order mark, unlike
            // encoding every line separately.
            writer = new BufferedWriter(new OutputStreamWriter(out, usedCharset));

            String line;
            while ((line = reader.readLine()) != null) {
                writer.write(line);
            }
            // Close explicitly so that a failing flush is reported.
            writer.close();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(writer);
            closeQuietly(out);
            closeQuietly(reader);
            closeQuietly(in);
        }
    }

    /**
     * Detects the charset declared by the page.
     *
     * <p>The page is scanned line by line and the first supported charset
     * name that is found wins. As a side effect the URL of this reader is set
     * to {@code strURL}. Failures are reported on standard error.
     *
     * @param strURL address of the page to inspect
     * @return a charset name supported by this JVM, or {@code null} when the
     *         page cannot be read or names no supported charset
     */
    public String getCharSet(String strURL){
        InputStream in = null;
        BufferedReader reader = null;
        try {
            myURL = new URL(strURL);
            in = myURL.openStream();
            // Charset names are plain ASCII; ISO-8859-1 maps every byte to a
            // character, so sniffing does not depend on the platform default.
            reader = new BufferedReader(new InputStreamReader(in, "ISO-8859-1"));

            String line;
            while ((line = reader.readLine()) != null) {
                String found = findCharset(line);
                if (found != null) {
                    return found;
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(reader);
            closeQuietly(in);
        }
        return null;
    }

    /**
     * Extracts the text of the saved page and writes it to a file encoded
     * with the detected charset.
     *
     * <p>I/O failures are reported on standard output.
     *
     * @param ofn name of the text file to create
     */
    public void parserText(String ofn){
        try {
            String text = tagFilter(new StringBuffer(readSavedPage())).toString();
            writeFile(ofn, text);
        } catch (IOException ex) {
            System.out.println("입출력 예외: " + ex.getMessage());
        }
    }

    /**
     * Extracts the text of the saved page.
     *
     * <p>I/O failures are reported on standard output.
     *
     * @return the extracted text, or an empty string when the saved page
     *         cannot be read
     */
    public String parserStr(){
        try {
            return tagFilter(new StringBuffer(readSavedPage())).toString();
        } catch (IOException ex) {
            System.out.println("입출력 예외: " + ex.getMessage());
            return "";
        }
    }

    /**
     * Tells whether a string contains a keyword.
     *
     * @param str     text to search
     * @param keyWord keyword to look for
     * @return {@code true} when {@code str} contains {@code keyWord};
     *         {@code false} otherwise, including when either argument is
     *         {@code null}
     */
    public boolean containKeyWord(String str, String keyWord){
        if (str == null || keyWord == null) {
            return false;
        }
        return str.contains(keyWord);
    }

    /**
     * Splits a string with {@link StringTokenizer} and restores the tag
     * delimiters on the pieces.
     *
     * <ul>
     * <li>token {@code "<"}: every piece is prefixed with {@code "<"};</li>
     * <li>token {@code ">"}: pieces containing {@code "<"} get {@code ">"}
     *     appended, other pieces are returned unchanged;</li>
     * <li>any other token: pieces are returned unchanged.</li>
     * </ul>
     *
     * <p>Empty pieces are never returned, and every character of
     * {@code token} acts as a delimiter.
     *
     * @param target string to split; {@code null} yields an empty array
     * @param token  delimiter characters, must not be {@code null}
     * @return the pieces in order of appearance
     */
    public String[] split(String target, String token){
        if (target == null) {
            return new String[0];
        }
        StringTokenizer st = new StringTokenizer(target, token);
        String[] ret = new String[st.countTokens()];
        // Compare by value: == only works for interned literals.
        boolean openingDelimiter = "<".equals(token);
        boolean closingDelimiter = ">".equals(token);

        for (int i = 0; st.hasMoreTokens(); i++) {
            String piece = st.nextToken();
            if (openingDelimiter) {
                ret[i] = token + piece;
            } else if (closingDelimiter && piece.contains("<")) {
                ret[i] = piece + token;
            } else {
                ret[i] = piece;
            }
        }
        return ret;
    }

    /**
     * Removes the tags from HTML code and returns the remaining text.
     *
     * <p>Each run of text between tags becomes one line terminated by
     * {@code "\n"}; all spaces and tabs are then removed. The unstripped text
     * is also printed to standard output.
     *
     * @param strBuf HTML code; {@code null} is treated as empty
     * @return the extracted text, empty when the input contains no text
     */
    public StringBuffer tagFilter(StringBuffer strBuf){
        StringBuffer retBuf = new StringBuffer();
        if (strBuf == null) {
            return retBuf;
        }
        String target = strBuf.toString();
        boolean startsWithText = !target.startsWith("<");
        StringBuilder text = new StringBuilder();

        String[] segments = split(target, "<");
        for (int i = 0; i < segments.length; i++) {
            String segment = segments[i];
            if (i == 0 && startsWithText) {
                // split() prefixes "<" to every piece, including text that
                // precedes the first tag; undo it so that text is kept.
                segment = segment.substring(1);
            }
            String[] pieces = split(segment, ">");
            for (int j = 0; j < pieces.length; j++) {
                String piece = pieces[j];
                boolean isTag = piece.contains("<") || piece.contains(">");
                if (isTag || piece.equals(" ")) {
                    continue;
                }
                text.append(piece).append("\n");
            }
        }
        System.out.println(text);

        retBuf.append(text.toString().replace(" ", "").replace("\t", ""));
        return retBuf;
    }

    /**
     * Returns the link addresses of a retrieved web page.
     *
     * <p>Not implemented yet: always returns {@code null}.
     *
     * @param fileName name of the file to inspect (currently unused)
     * @return always {@code null}
     */
    public String getLink(String fileName){
        String result = null;
        return result;
    }

    /** Reads the saved page and returns its lines concatenated without terminators. */
    private String readSavedPage() throws IOException {
        InputStream fis = null;
        BufferedReader reader = null;
        try {
            fis = new FileInputStream(fileName);
            reader = new BufferedReader(new InputStreamReader(fis, usedCharset));
            StringBuilder page = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                page.append(line);
            }
            return page.toString();
        } finally {
            closeQuietly(reader);
            closeQuietly(fis);
        }
    }

    /** Writes {@code content} to {@code path} encoded with the detected charset. */
    private void writeFile(String path, String content) throws IOException {
        OutputStream out = null;
        Writer writer = null;
        try {
            out = new FileOutputStream(path);
            writer = new OutputStreamWriter(out, usedCharset);
            writer.write(content);
            // Close explicitly so that a failing flush is reported.
            writer.close();
        } finally {
            closeQuietly(writer);
            closeQuietly(out);
        }
    }

    /** Returns the first supported charset name mentioned in {@code line}, or {@code null}. */
    private static String findCharset(String line){
        String upper = line.toUpperCase(java.util.Locale.ENGLISH);
        for (int i = 0; i < CHARSET_PREFIXES.length; i++) {
            String prefix = CHARSET_PREFIXES[i];
            int start = upper.indexOf(prefix);
            if (start < 0) {
                continue;
            }
            // Extend the match to the complete name, e.g. ISO-8859 -> ISO-8859-1.
            int end = start + prefix.length();
            while (end < upper.length() && isCharsetNameChar(upper.charAt(end))) {
                end++;
            }
            String candidate = upper.substring(start, end);
            if (isSupportedCharset(candidate)) {
                return candidate;
            }
            if (isSupportedCharset(prefix)) {
                return prefix;
            }
        }
        return null;
    }

    /** Tells whether {@code c} may continue an upper-cased charset name. */
    private static boolean isCharsetNameChar(char c){
        return (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '-' || c == '_';
    }

    /** Tells whether this JVM can decode and encode the named charset. */
    private static boolean isSupportedCharset(String name){
        try {
            return Charset.isSupported(name);
        } catch (IllegalArgumentException e) {
            return false; // syntactically illegal charset name
        }
    }

    /** Closes a resource on a cleanup path, ignoring failures and {@code null}. */
    private static void closeQuietly(Closeable resource){
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // Best effort: the primary outcome has already been decided.
        }
    }

    public static void main(String[] args){
        HtmlReader page = new HtmlReader("http://localhost:8080/WebCharTest/test.html");
        page.parserText("result.txt");
    }
}

저작자표시 비영리 동일조건 (새창열림)

'JAVA/JSP > Source' 카테고리의 다른 글

TagFilter (0)	2009.01.16
Morphemer (0)	2009.01.16
html tag filter (0)	2009.01.09
test (0)	2009.01.08
xmlParser (0)	2009.01.07

And

| 1 | 2 | 3 | 4 |

일	월	화	수	목	금	토
		1	2	3	4	5
6	7	8	9	10	11	12
13	14	15	16	17	18	19
20	21	22	23	24	25	26
27	28	29	30	31

진화를 꿈꾸다..

Article Category

Notice

Recent Article

Recent Comment

Recent Trackback

'html'에 해당되는 글 4건

html tag filter(2)

'JAVA/JSP > Source' 카테고리의 다른 글

Tag Cloud

Calendar

Archive

My Link

티스토리툴바