站内搜索: 请输入搜索关键词

当前页面: 开发资料首页JSP 专题将HTML转化为TEXT的Java类

将HTML转化为TEXT的Java类

摘要: 将HTML转化为TEXT的Java类


  为了支持全文检索,有必要将HTML格式的文章转化为纯文本格式,因此我设计了一个基本的WebFormatter类,提供一个简单的public static String html2text(String html),将HTML格式转化为Text:

/*
* File: WebFormatter.java
* Created on 2005-6-24
* Author: Liao Xuefeng, asklxf@163.com
* Copyright (C) 2005, Liao Xuefeng.
*/
package com.mboker.blog.web.util;

import java.util.*;
import java.text.SimpleDateFormat;

/**
* Do some format on web display.
*
* @author Xuefeng
*/
public class WebFormatter {

public static String html2text(String html) {
StringBuffer sb = new StringBuffer(html.length());
char[] data = html.toCharArray();
int start = 0;
boolean previousIsPre = false;
Token token = null;
for(;;) {
token = parse(data, start, previousIsPre);
if(token==null)
break;
previousIsPre = token.isPreTag();
sb = sb.append(token.getText());
start += token.getLength();
}
return sb.toString();
}

private static Token parse(char[] data, int start, boolean previousIsPre) {
if(start>=data.length)
return null;
// try to read next char:
char c = data[start];
if(c=='<') {
// this is a tag or comment or script:
int end_index = indexOf(data, start+1, '>');
if(end_index==(-1)) {
// the left is all text!
return new Token(Token.TOKEN_TEXT, data, start, data.length, previousIsPre);
}
String s = new String(data, start, end_index-start+1);
// now we got s="<...>":
if(s.startsWith("

private static int indexOf(char[] data, int start, String s) {
char[] ss = s.toCharArray();
// TODO: performance can improve!
for(int i=start; i<(data.length-ss.length); i++) {
// compare from data[i] with ss[0]:
boolean match = true;
for(int j=0; j if(data[i+j]!=ss[j]) {
match = false;
break;
}
}
if(match)
return i;
}
return (-1);
}

private static int indexOf(char[] data, int start, char c) {
for(int i=start; i if(data[i]==c)
return i;
}
return (-1);
}

}

class Token {

public static final int TOKEN_TEXT = 0; // html text.
public static final int TOKEN_COMMENT = 1; // comment like

private static final char[] TAG_BR = " private static final char[] TAG_P = " private static final char[] TAG_LI = " private static final char[] TAG_PRE = " private static final char[] TAG_HR = "

private static final char[] END_TAG_TD = "</td>".toCharArray();
private static final char[] END_TAG_TR = "</tr>".toCharArray();
private static final char[] END_TAG_LI = "".toCharArray();

private static final Map SPECIAL_CHARS = new HashMap();

private int type;
private String html; // original html
private String text = null; // text!
private int length = 0; // html length
private boolean isPre = false; // isPre tag?

static {
SPECIAL_CHARS.put("&quot;", "\"");
SPECIAL_CHARS.put("&lt;", "<");
SPECIAL_CHARS.put("&gt;", ">");
SPECIAL_CHARS.put("&amp;", "&");
SPECIAL_CHARS.put("&reg;", "(r)");
SPECIAL_CHARS.put("&copy;", "(c)");
SPECIAL_CHARS.put("&nbsp;", " ");
SPECIAL_CHARS.put("&pound;", "?");
}

public Token(int type, char[] data, int start, int end, boolean previousIsPre) {
this.type = type;
this.length = end - start;
this.html = new String(data, start, length);
System.out.println("[Token] html=" + html + ".");
parseText(previousIsPre);
System.out.println("[Token] text=" + text + ".");
}

public int getLength() {
return length;
}

public boolean isPreTag() {
return isPre;
}

private void parseText(boolean previousIsPre) {
if(type==TOKEN_TAG) {
char[] cs = html.toCharArray();
if(compareTag(TAG_BR, cs) || compareTag(TAG_P, cs))
text = "\n";
else if(compareTag(TAG_LI, cs))
text = "\n* ";
else if(compareTag(TAG_PRE, cs))
isPre = true;
else if(compareTag(TAG_HR, cs))
text = "\n--------\n";
else if(compareString(END_TAG_TD, cs))
text = "\t";
else if(compareString(END_TAG_TR, cs) || compareString(END_TAG_LI, cs))
text = "\n";
}
// text token:
else if(type==TOKEN_TEXT) {
text = toText(html, previousIsPre);
}
}

public String getText() {
return text==null ? "" : text;
}

private String toText(String html, final boolean isPre) {
char[] cs = html.toCharArray();
StringBuffer buffer = new StringBuffer(cs.length);
int start = 0;
boolean continueSpace = false;
char current, next;
for(;;) {
if(start>=cs.length)
break;
current = cs[start]; // read current char
if(start+1 next = cs[start+1];
else
next = '\0';
if(current==' ') {
if(isPre || !continueSpace)
buffer = buffer.append(' ');
continueSpace = true;
// continue loop:
start++;
continue;
}
// not ' ', so:
if(current=='\r' && next=='\n') {
if(isPre)
buffer = buffer.append('\n');
// continue loop:
start+=2;
continue;
}
if(current=='\n' || current=='\r') {
if(isPre)
buffer = buffer.append('\n');
// continue loop:
start++;
continue;
}
// cannot continue space:
continueSpace = false;
if(current=='&') {
// maybe special char:
int length = readUtil(cs, start, ';', 10);
if(length==(-1)) { // just '&':
buffer = buffer.append('&');
// continue loop:
start++;
continue;
}
else { // check if special character:
String spec = new String(cs, start, length);
String specChar = (String)SPECIAL_CHARS.get(spec);
if(specChar!=null) { // special chars!
buffer = buffer.append(specChar);
// continue loop:
start+=length;
continue;
}
else { // check if like '&#1234':
if(next=='#') { // maybe a char
String num = new String(cs, start+2, length-3);
try {
int code = Integer.parseInt(num);
if(code>0 && code<65536) { // this is a special char:
buffer = buffer.append((char)code);
// continue loop:
start++;
continue;
}
}
catch(Exception e) {}
// just normal char:
buffer = buffer.append("&#");
// continue loop:
start+=2;
continue;
}
else { // just '&':
buffer = buffer.append('&');
// continue loop:
start++;
continue;
}
}
}
}
else { // just a normal char!
buffer = buffer.append(current);
// continue loop:
start++;
continue;
}
}
return buffer.toString();
}

// read from cs[start] util meet the specified char 'util',
// or null if not found:
private int readUtil(final char[] cs, final int start, final char util, final int maxLength) {
int end = start+maxLength;
if(end>cs.length)
end = cs.length;
for(int i=start; i if(cs[i]==util) {
return i-start+1;
}
}
return (-1);
}

// compare standard tag "<input" with tag "<input value=aa>" if(ori_tag.length>=tag.length)
return false;
for(int i=0; i if(Character.toLowerCase(tag[i])!=ori_tag[i])
return false;
}
// the following char should not be a-z:
if(tag.length>ori_tag.length) {
char c = Character.toLowerCase(tag[ori_tag.length]);
if(c<'a' || c>'z')
return true;
return false;
}
return true;
}

private boolean compareString(final char[] ori, char[] comp) {
if(ori.length>comp.length)
return false;
for(int i=0; i if(Character.toLowerCase(comp[i])!=ori[i])
return false;
}
return true;
}

public String toString() {
return html;
}
}

  注意,请先将html中的<body>...</body>部分提取出来,再交给WebFormatter处理,因为html->text转换实质是删除所有标签(某些标签如
被转化为'\n')、Script和注释,对于JavaScript生成的动态内容(例如document.write)无能为力。

</td> </tr> <tr> <td vAlign=top align=left height="100%">
↑返回目录
前一篇: 用于MIDP的URLEncoder类
后一篇: Swt常用控件中文教程