'html'에 해당되는 글 4건
- 2009.01.12 html tag filter(2)
package filter;
import java.io.*;
import java.net.*;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.Charset;
import java.util.StringTokenizer;
import java.util.Vector;
/**
 * Downloads an HTML page, caches it in a local file and extracts the text
 * found between its tags.
 *
 * <p>The constructor performs the network access: it sniffs the page's
 * character set, then downloads the page into the cache file
 * ({@code tmp.html} in the working directory). {@link #parserText(String)}
 * and {@link #parserStr()} read that cache file afterwards.
 *
 * <p>Failures are reported on the console and never propagate to the caller.
 */
public class HtmlReader {
    /** Charset used when the page does not declare a usable one. */
    private static final String DEFAULT_CHARSET = "UTF-8";

    /** Charset names looked for verbatim when no explicit declaration is found. */
    private static final String[] CHARSET_LIST = {"UTF-8", "EUC-JP", "EUC-KR",
        "ISO-8859", "US-ASCII", "UTF-16", "UTF-16BE", "UTF-16LE",
        "utf-8", "euc-jp", "euc-kr", "iso-8859", "us-ascii", "utf-16",
        "utf-16be", "utf-16le"};

    private URL myURL = null;
    private String usedCharset = DEFAULT_CHARSET; // default charset : UTF-8
    private final String fileName = "tmp.html";

    /**
     * Detects the page's charset and downloads the page into the cache file.
     *
     * @param strURL address of the page to read
     */
    public HtmlReader(String strURL) {
        try {
            // Keep the UTF-8 default when nothing usable was detected; a null
            // or unsupported name would make every later read fail.
            String detected = this.getCharSet(strURL);
            if (isUsableCharset(detected)) {
                usedCharset = detected;
            }
            myURL = new URL(strURL);
            this.readPage();
        } catch (MalformedURLException e) {
            e.printStackTrace();
        }
    }

    /**
     * Downloads the page into the cache file, re-encoded with the charset in
     * use. Line terminators are not written, so the cached copy is the page's
     * lines joined together.
     */
    public void readPage() {
        if (myURL == null) {
            System.err.println("HtmlReader: no valid URL to read");
            return;
        }
        BufferedReader reader = null;
        FileOutputStream out = null;
        try {
            Charset charset = Charset.forName(usedCharset);
            reader = new BufferedReader(new InputStreamReader(myURL.openStream(), charset));
            out = new FileOutputStream(fileName);
            FileChannel channel = out.getChannel();
            String line;
            while ((line = reader.readLine()) != null) {
                writeFully(channel, charset.encode(line));
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(out);
            closeQuietly(reader);
        }
    }

    /**
     * Fetches the page and returns the first usable charset name it mentions.
     *
     * <p>Each line is checked for a {@code charset=} or {@code encoding=}
     * declaration first, then for one of the well-known names. Only names the
     * running JVM supports are returned.
     *
     * @param strURL address of the page to inspect
     * @return the charset name as written in the page, or {@code null} when
     *         none was found or the page could not be read
     */
    public String getCharSet(String strURL) {
        BufferedReader reader = null;
        try {
            URL url = new URL(strURL);
            // ISO-8859-1 maps every byte to a character, so the ASCII
            // declaration can be sniffed regardless of the real encoding.
            reader = new BufferedReader(
                    new InputStreamReader(url.openStream(), Charset.forName("ISO-8859-1")));
            String line;
            while ((line = reader.readLine()) != null) {
                String found = findCharset(line);
                if (found != null) {
                    return found;
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(reader);
        }
        return null;
    }

    /**
     * Extracts the text of the cached page and writes it to a file, encoded
     * with the charset in use.
     *
     * @param ofn name of the text file to create
     */
    public void parserText(String ofn) {
        FileOutputStream out = null;
        try {
            StringBuffer text = readFilteredText();
            ByteBuffer encoded = Charset.forName(usedCharset).encode(text.toString());
            out = new FileOutputStream(ofn);
            writeFully(out.getChannel(), encoded);
        } catch (IOException ex) {
            System.out.println("입출력 예외: " + ex.getMessage());
        } finally {
            closeQuietly(out);
        }
    }

    /**
     * Extracts the text of the cached page.
     *
     * @return the extracted text, or an empty string when the cache file
     *         could not be read
     */
    public String parserStr() {
        try {
            return readFilteredText().toString();
        } catch (IOException ex) {
            System.out.println("입출력 예외: " + ex.getMessage());
            return "";
        }
    }

    /**
     * Tells whether a string contains a keyword.
     *
     * @param str     text to search
     * @param keyWord keyword to look for
     * @return {@code true} when {@code str} contains {@code keyWord}
     */
    public boolean containKeyWord(String str, String keyWord) {
        return str.contains(keyWord);
    }

    /**
     * Splits a string with {@link StringTokenizer} and restores the tag
     * brackets the tokenizer drops.
     *
     * <p>With token {@code "<"} every piece that followed a {@code <} gets it
     * back as a prefix; text preceding the first {@code <} is returned as is.
     * With token {@code ">"} pieces containing {@code <} get {@code >}
     * appended. Any other token yields the plain tokens.
     *
     * @param target string to split
     * @param token  delimiter characters
     * @return the pieces in order of appearance
     */
    public String[] split(String target, String token) {
        StringTokenizer st = new StringTokenizer(target, token);
        String[] ret = new String[st.countTokens()];
        // equals(), not ==: the caller's token is not necessarily interned.
        boolean opensTag = "<".equals(token);
        boolean closesTag = ">".equals(token);
        boolean startsWithTag = opensTag && target.startsWith(token);
        for (int i = 0; st.hasMoreTokens(); i++) {
            String piece = st.nextToken();
            if (opensTag) {
                // Only the first piece can lack a preceding '<'.
                ret[i] = (i > 0 || startsWithTag) ? token + piece : piece;
            } else if (closesTag && piece.contains("<")) {
                ret[i] = piece + token;
            } else {
                ret[i] = piece;
            }
        }
        return ret;
    }

    /**
     * Removes the tags from HTML code.
     *
     * <p>Every text fragment found outside a tag is kept on its own line
     * (fragments consisting of a single space are skipped), then all spaces
     * and tabs are removed. The text is also echoed to standard output before
     * the spaces are removed.
     *
     * @param strBuf HTML code
     * @return the extracted text; empty when the input contains no text
     */
    public StringBuffer tagFilter(StringBuffer strBuf) {
        StringBuilder text = new StringBuilder();
        String[] chunks = split(strBuf.toString(), "<");
        for (int i = 0; i < chunks.length; i++) {
            String[] fragments = split(chunks[i], ">");
            for (int j = 0; j < fragments.length; j++) {
                if (isText(fragments[j])) {
                    text.append(fragments[j]).append("\n");
                }
            }
        }
        String collected = text.toString();
        System.out.println(collected);
        return new StringBuffer(collected.replace(" ", "").replace("\t", ""));
    }

    /**
     * Placeholder for link extraction; not implemented yet.
     *
     * @param fileName currently unused
     * @return always {@code null}
     */
    public String getLink(String fileName) {
        String result = null;
        return result;
    }

    public static void main(String[] args) {
        HtmlReader page = new HtmlReader("http://localhost:8080/WebCharTest/test.html");
        page.parserText("result.txt");
    }

    /** Reads the cache file, joins its lines and strips the tags. */
    private StringBuffer readFilteredText() throws IOException {
        FileInputStream fis = null;
        BufferedReader reader = null;
        try {
            fis = new FileInputStream(fileName);
            reader = new BufferedReader(new InputStreamReader(fis, usedCharset));
            StringBuffer page = new StringBuffer();
            String line;
            while ((line = reader.readLine()) != null) {
                page.append(line);
            }
            return tagFilter(page);
        } finally {
            closeQuietly(reader);
            closeQuietly(fis);
        }
    }

    /** Returns the first usable charset name mentioned on a line, or null. */
    private static String findCharset(String line) {
        String declared = declaredCharset(line, "charset");
        if (declared == null) {
            declared = declaredCharset(line, "encoding");
        }
        if (declared != null) {
            return declared;
        }
        for (int i = 0; i < CHARSET_LIST.length; i++) {
            if (line.contains(CHARSET_LIST[i]) && isUsableCharset(CHARSET_LIST[i])) {
                return CHARSET_LIST[i];
            }
        }
        return null;
    }

    /** Parses a {@code key=name} declaration (key is case-insensitive) from a line. */
    private static String declaredCharset(String line, String key) {
        int keyLength = key.length();
        for (int i = 0; i + keyLength <= line.length(); i++) {
            if (!line.regionMatches(true, i, key, 0, keyLength)) {
                continue;
            }
            int pos = skipWhitespace(line, i + keyLength);
            if (pos >= line.length() || line.charAt(pos) != '=') {
                continue;
            }
            pos = skipWhitespace(line, pos + 1);
            if (pos < line.length() && (line.charAt(pos) == '"' || line.charAt(pos) == '\'')) {
                pos++;
            }
            int end = pos;
            while (end < line.length() && isCharsetNameChar(line.charAt(end))) {
                end++;
            }
            String name = line.substring(pos, end);
            if (isUsableCharset(name)) {
                return name;
            }
        }
        return null;
    }

    /** Returns the index of the first non-whitespace character at or after {@code from}. */
    private static int skipWhitespace(String line, int from) {
        int pos = from;
        while (pos < line.length() && Character.isWhitespace(line.charAt(pos))) {
            pos++;
        }
        return pos;
    }

    /** Tells whether a character may appear in a charset name. */
    private static boolean isCharsetNameChar(char c) {
        return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')
                || c == '-' || c == '_' || c == '.' || c == ':' || c == '+';
    }

    /** Tells whether the running JVM can decode the named charset. */
    private static boolean isUsableCharset(String name) {
        if (name == null || name.length() == 0) {
            return false;
        }
        try {
            return Charset.isSupported(name);
        } catch (IllegalArgumentException illegalName) {
            return false;
        }
    }

    /** Tells whether a fragment is page text rather than a tag or a lone space. */
    private static boolean isText(String fragment) {
        return !fragment.contains("<") && !fragment.contains(">") && !fragment.equals(" ");
    }

    /** Writes the whole buffer; a single channel write may be partial. */
    private static void writeFully(FileChannel channel, ByteBuffer data) throws IOException {
        while (data.hasRemaining()) {
            channel.write(data);
        }
    }

    /** Closes a resource, ignoring null and close failures (best-effort cleanup). */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // Nothing useful can be done when closing fails during cleanup.
        }
    }
}