解析得到除去标签的txt内容

复制代码 代码如下:

  // Collect every node accepted by body_filter, then strip the markup from each one.
  NodeList matched=this.getParser().parse(body_filter);
  for(int idx=0;idx<matched.size();idx++)
  {
    // Re-parse the node's own HTML so the visitor only walks this fragment.
    String fragmentHtml=matched.elementAt(idx).toHtml();
    Parser fragmentParser=new Parser(fragmentHtml);
    // The visitor gathers the text it encounters while walking the fragment.
    TextExtractingVisitor textCollector=new TextExtractingVisitor();
    fragmentParser.visitAllNodesWith(textCollector);
    body.append(textCollector.getExtractedText());
  }

  TextExtractingVisitor、visitAllNodesWith 等类及方法在 Visitor 的用法中比较重要,但也比较少见。

  下面附源代码:

  

复制代码 代码如下:

  import java.io.BufferedWriter;

  import java.io.File;

  import java.io.FileWriter;

  import java.io.IOException;

  import java.util.Date;

  import org.htmlparser.Node;

  import org.htmlparser.NodeFilter;

  import org.htmlparser.Parser;

  import org.htmlparser.filters.AndFilter;

  import org.htmlparser.filters.HasAttributeFilter;

  import org.htmlparser.filters.HasChildFilter;

  import org.htmlparser.filters.TagNameFilter;

  import org.htmlparser.util.NodeList;

  import org.htmlparser.visitors.TextExtractingVisitor;

  import com.extractor.Extractor;

  /**
   * Extractor for one mirrored notice page. It writes four pieces of
   * information, in this order, to the file "outputPath + title + .txt":
   * the reconstructed source URL, the title, the time text and the body text
   * with all tags removed.
   *
   * Every stage is best-effort: a failure is reported with printStackTrace()
   * and the remaining stages still run, except that nothing further is
   * attempted when the output file could not be opened at all.
   */
  public class ExtractorHangdian extends Extractor{

  /**
   * Position of the title text among the children of the first matching
   * title cell. Per the original author's notes the children are:
   * 0 = "\r\n", 1 = a font tag, 2 = a b tag, 3 = the title text.
   */
  private static final int TITLE_CHILD_INDEX=3;

  /** The time marker is the second node that matches the time filter. */
  private static final int TIME_NODE_INDEX=1;

  /**
   * Character offsets used to cut the input file path into URL pieces:
   * [URL_HOST_START, URL_SPLIT) becomes the part before ".asp?", and
   * [URL_SPLIT, last '.') becomes the query string.
   * NOTE(review): these offsets look tied to one specific mirror directory
   * layout (presumably skipping a drive prefix) - confirm before reuse.
   */
  private static final int URL_HOST_START=3;
  private static final int URL_SPLIT=30;

  /**
   * Runs the extraction for the page currently held by getParser() and
   * writes the result file. Never throws a checked exception; problems are
   * printed to standard error.
   */
  public void extract()
  {
    BufferedWriter bw=null;
    try
    {
      try
      {
        String title=extractTitle();
        // The file is opened before the URL is computed so that a bad input
        // path still leaves a file to which time and body can be written.
        bw=new BufferedWriter(new FileWriter(new File(this.getOutputPath()+title+".txt")));
        bw.write(buildUrl()+NEWLINE);
        bw.write(title+NEWLINE);
      }
      catch(Exception e)
      {
        e.printStackTrace();
      }
      this.getParser().reset();

      if(bw==null)
      {
        // No output file: parsing the page again would only produce
        // NullPointerExceptions on write, so stop here.
        return;
      }

      writeTime(bw);
      this.getParser().reset();
      writeBody(bw);
    }
    finally
    {
      // Closing flushes the buffer; without it the file content is lost.
      if(bw!=null)
      {
        try
        {
          bw.close();
        }
        catch(IOException e)
        {
          e.printStackTrace();
        }
      }
    }
  }

  /**
   * Returns the title: child number TITLE_CHILD_INDEX of the first td cell
   * that has a b child, rendered with toHtml().
   *
   * @throws Exception if parsing fails or the expected nodes are missing
   */
  private String extractTitle() throws Exception
  {
    NodeFilter title_filter=new AndFilter(new TagNameFilter("td"),new HasChildFilter(new TagNameFilter("b")));
    NodeList title_nodes=this.getParser().parse(title_filter);
    if(title_nodes.size()==0)
    {
      throw new IllegalStateException("no td cell with a b child in "+getInputFilePath());
    }
    NodeList children=title_nodes.elementAt(0).getChildren();
    if(children==null||children.size()<=TITLE_CHILD_INDEX)
    {
      throw new IllegalStateException("title cell has fewer than "+(TITLE_CHILD_INDEX+1)+" children in "+getInputFilePath());
    }
    return children.elementAt(TITLE_CHILD_INDEX).toHtml();
  }

  /**
   * Rebuilds the page URL from the input file path: the two path slices are
   * joined with ".asp?", backslashes become forward slashes and "http://"
   * is prepended.
   *
   * @throws IllegalStateException if the path has no '.' at or after URL_SPLIT
   */
  private String buildUrl()
  {
    String path=getInputFilePath();
    int end=path.lastIndexOf(".");
    if(end<URL_SPLIT)
    {
      throw new IllegalStateException("input path too short or without extension, cannot derive URL: "+path);
    }
    String url_seg=path.substring(URL_HOST_START,URL_SPLIT)+".asp?"+path.substring(URL_SPLIT,end);
    return "http://"+url_seg.replace('\\','/');
  }

  /**
   * Writes the time line: the toHtml() text of the node that follows the
   * second grey (color #808080) font tag. Failures are printed, not thrown.
   */
  private void writeTime(BufferedWriter bw)
  {
    try
    {
      NodeFilter time_filter=new AndFilter(new TagNameFilter("font"),new HasAttributeFilter("color","#808080"));
      NodeList time_nodes=this.getParser().parse(time_filter);
      if(time_nodes.size()<=TIME_NODE_INDEX)
      {
        throw new IllegalStateException("fewer than "+(TIME_NODE_INDEX+1)+" grey font tags in "+getInputFilePath());
      }
      Node time_value=time_nodes.elementAt(TIME_NODE_INDEX).getNextSibling();
      if(time_value==null)
      {
        throw new IllegalStateException("time marker has no following node in "+getInputFilePath());
      }
      bw.write(time_value.toHtml()+NEWLINE);
    }
    catch(Exception e)
    {
      e.printStackTrace();
    }
  }

  /**
   * Writes the body line: the tag-free text of every td cell that has a p
   * child, concatenated. Nothing is written if any cell fails to parse.
   * Failures are printed, not thrown.
   */
  private void writeBody(BufferedWriter bw)
  {
    try
    {
      NodeFilter body_filter=new AndFilter(new TagNameFilter("td"),new HasChildFilter(new TagNameFilter("p")));
      NodeList body_nodes=this.getParser().parse(body_filter);
      StringBuilder body=new StringBuilder();
      for(int i=0;i<body_nodes.size();i++)
      {
        // Re-parse each cell on its own so the visitor only sees that cell.
        Parser body_parser=new Parser(body_nodes.elementAt(i).toHtml());
        TextExtractingVisitor visitor=new TextExtractingVisitor();
        body_parser.visitAllNodesWith(visitor);
        body.append(visitor.getExtractedText());
      }
      bw.write(body+NEWLINE);
    }
    catch(Exception e)
    {
      e.printStackTrace();
    }
  }

  }

  这里顺便提一下:当年因为bw没有关掉,缓冲区里的内容没有被刷新到文件,结果生成的文件怎么都读不到内容,搞了我好几天,郁闷死了,想起来就火大。注意:用完之后一定要关闭bw!!