crawler_基于块儿统计正文抽取_改进版

import java.util.ArrayList;

import java.util.Arrays;

import java.util.List;

import java.util.regex.Pattern;
/**
 * Extracts the main body text of topic-style web pages (news, blogs, ...) in
 * roughly linear time.
 *
 * <p>
 * The approach is based on a <b>line-block distribution function</b>: after all
 * markup is stripped, every run of {@code BLOCKS_WIDTH} consecutive lines forms
 * a block whose size is its number of non-whitespace characters. Stretches of
 * large blocks are taken to be body text. No site-specific rules are used.
 * </p>
 *
 * <p>
 * History: created 2009-01-11 and updated 2010-08-09 by Chen Xin; 2014-06-05
 * cphmvp replaced the iterative link-run removal with a single regular
 * expression (reported as about three times faster) and improved handling of
 * list pages that contain other tags.
 * </p>
 *
 * <p>
 * Instances hold no state between calls.
 * </p>
 *
 * @author Chen Xin
 */
public class TextExtract {

    /** Number of consecutive lines that are summed into one block. */
    private static final int BLOCKS_WIDTH = 3;

    /** Lines shorter than this (after whitespace removal) are left out of the output. */
    private static final int MIN_LINE_LENGTH = 5;

    /*
     * Bounds of the block-size threshold. Raising the threshold improves
     * precision (stray headline blocks are rejected) but lowers recall; lowering
     * it admits more noise but still catches a body that is a single sentence.
     */
    private static final int MIN_THRESHOLD = 50;
    private static final int MAX_THRESHOLD = 100;

    private static final Pattern DOCTYPE = Pattern.compile("(?is)<!DOCTYPE.*?>");
    private static final Pattern COMMENT = Pattern.compile("(?is)<!--.*?-->");
    private static final Pattern SCRIPT = Pattern.compile("(?is)<script.*?>.*?</script>");
    private static final Pattern STYLE = Pattern.compile("(?is)<style.*?>.*?</style>");
    private static final Pattern ENTITY = Pattern.compile("&.{2,5};|&#.{2,5};");
    private static final Pattern SPAN_OPEN = Pattern.compile("<[sS][pP][aA][nN].*?>");
    private static final Pattern SPAN_CLOSE = Pattern.compile("</[sS][pP][aA][nN]>");

    /**
     * Three or more anchors in a row, with at most 30 arbitrary characters
     * tolerated between neighbouring anchors.
     */
    private static final Pattern LINK_RUN = Pattern.compile(
            "<[a|A][^>]*?>[^>]+</[a|A]>(?:\\s*[\\s\\S]{0,30}\\s*<[a|A][^>]*?>[^>]+</[a|A]>){2,100}");

    /**
     * A tag with quoted attribute values; needed because a quoted value may
     * itself contain a greater-than sign.
     *
     * NOTE(review): the inner ".*" is greedy, so on one line the match runs from
     * the first such tag to the last quote on that line, and any text between
     * two quoted tags on the same line is removed with them. Kept as is because
     * changing it would alter extraction results.
     */
    private static final Pattern QUOTED_TAG = Pattern.compile("<[^>'\"]*['\"].*['\"].*?>");

    private static final Pattern TAG = Pattern.compile("<.*?>");
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /** Creates an extractor. */
    public TextExtract() {
    }

    /**
     * Extracts the body text of a page that is assumed to be a topic page.
     *
     * @param _html
     *            HTML source of the page
     * @return the extracted text, one kept line per output line; empty if
     *         nothing qualifies or {@code _html} is {@code null}
     */
    public String parse(String _html) {
        return parse(_html, false);
    }

    /**
     * Extracts the body text of a page.
     *
     * @param _html
     *            HTML source of the page
     * @param _flag
     *            intended to request a "is this a topic page" check before
     *            extraction; no such check exists in this class, so the value
     *            is currently ignored
     * @return the extracted text, one kept line per output line; empty if
     *         nothing qualifies or {@code _html} is {@code null}
     */
    public String parse(String _html, boolean _flag) {
        if (_html == null) {
            return "";
        }
        String cleaned = preProcess(_html);
        // preProcess deletes every space; put the one inside "img src=" back
        // (fix by cphmvp, 2014-06-04).
        return getText(cleaned).replace("imgsrc=", "img src=");
    }

    /**
     * Strips markup from the page and returns the remaining text, still split
     * into the lines of the original source.
     */
    private static String preProcess(String source) {
        source = DOCTYPE.matcher(source).replaceAll("");
        source = COMMENT.matcher(source).replaceAll("");
        source = SCRIPT.matcher(source).replaceAll("");
        source = STYLE.matcher(source).replaceAll("");
        source = ENTITY.matcher(source).replaceAll(" ");

        // Links are often wrapped in spans; unwrap them so that runs of links
        // become adjacent and can be recognised below.
        source = SPAN_OPEN.matcher(source).replaceAll("");
        source = SPAN_CLOSE.matcher(source).replaceAll("");

        source = source.replace(" ", "");

        // Runs of hyperlinks are treated as navigation or advertising noise.
        source = LINK_RUN.matcher(source).replaceAll("");

        source = QUOTED_TAG.matcher(source).replaceAll("");
        // Two passes: deleting a tag can expose a new one, e.g. "<<b>i>".
        source = TAG.matcher(source).replaceAll("");
        source = TAG.matcher(source).replaceAll("");

        return source.replace("\r\n", "\n");
    }

    /**
     * Selects the body text from pre-processed page text using the block-size
     * distribution.
     *
     * @param html
     *            output of {@link #preProcess(String)}
     * @return the selected lines, each followed by a newline
     */
    private String getText(String html) {
        List<String> lines = new ArrayList<String>();
        for (String raw : html.split("\n")) {
            lines.add(WHITESPACE.matcher(raw).replaceAll(""));
        }

        int blockCount = lines.size() - BLOCKS_WIDTH;
        if (blockCount <= 0) {
            // Too few lines to form a single block (e.g. the page was all script).
            return "";
        }

        int[] blocks = new int[blockCount];
        int emptyLines = 0;
        long total = 0;
        for (int i = 0; i < blockCount; i++) {
            if (lines.get(i).length() == 0) {
                emptyLines++;
            }
            int size = 0;
            for (int j = i; j < i + BLOCKS_WIDTH; j++) {
                size += lines.get(j).length();
            }
            blocks[i] = size;
            total += size;
        }

        int threshold = computeThreshold(total, blockCount, emptyLines, lines.size());

        int start = -1;
        int end = -1;
        boolean inBlock = false;
        // The first block is usually a short title, so it is admitted at half
        // the threshold.
        boolean firstMatch = true;
        StringBuilder text = new StringBuilder();

        for (int i = 0; i < blockCount - 1; i++) {
            if (!inBlock) {
                boolean startsHere;
                if (firstMatch && blocks[i] > threshold / 2
                        && (blockAt(blocks, i + 1) != 0 || blockAt(blocks, i + 2) != 0)) {
                    firstMatch = false;
                    startsHere = true;
                } else {
                    startsHere = blocks[i] > threshold
                            && (blockAt(blocks, i + 1) != 0
                                    || blockAt(blocks, i + 2) != 0
                                    || blockAt(blocks, i + 3) != 0);
                }
                if (startsHere) {
                    inBlock = true;
                    start = i;
                }
                continue;
            }

            if (blocks[i] == 0 || blocks[i + 1] == 0) {
                end = i;
                String section = joinLines(lines, start, end);
                if (containsCopyright(section)) {
                    // A copyright notice marks the page footer: nothing after
                    // it is extracted. (Previously the flags were simply left
                    // set, which re-rejected the same growing section on every
                    // later iteration; stopping here gives the same result.)
                    break;
                }
                text.append(section);
                inBlock = false;
            }
        }

        // A section that was opened but never closed runs to the end of the page.
        if (start > end) {
            String tail = joinLines(lines, start, lines.size() - 1);
            if (!containsCopyright(tail)) {
                text.append(tail);
            }
        }
        return text.toString();
    }

    /**
     * Derives the block-size threshold: the average block size, doubled once
     * for every two units of the empty-to-non-empty line ratio, clamped to
     * [{@code MIN_THRESHOLD}, {@code MAX_THRESHOLD}].
     */
    private static int computeThreshold(long total, int blockCount, int emptyLines, int lineCount) {
        int average = (int) (total / blockCount);
        // emptyLines is counted over the first blockCount lines only, so the
        // divisor is at least BLOCKS_WIDTH.
        int shift = (emptyLines / (lineCount - emptyLines)) >>> 1;
        // Shift in long with a capped distance so a large ratio cannot wrap around.
        long scaled = (long) average << Math.min(shift, 31);
        return (int) Math.max(MIN_THRESHOLD, Math.min(MAX_THRESHOLD, scaled));
    }

    /** Returns the size of block {@code index}, or 0 for a block past the end. */
    private static int blockAt(int[] blocks, int index) {
        return index < blocks.length ? blocks[index] : 0;
    }

    /**
     * Concatenates lines {@code from..to} (inclusive), each followed by a
     * newline, skipping lines shorter than {@code MIN_LINE_LENGTH}.
     */
    private static String joinLines(List<String> lines, int from, int to) {
        StringBuilder joined = new StringBuilder();
        for (int k = from; k <= to; k++) {
            String line = lines.get(k);
            if (line.length() >= MIN_LINE_LENGTH) {
                joined.append(line).append('\n');
            }
        }
        return joined.toString();
    }

    /** Tells whether the section carries an English or Chinese copyright marker. */
    private static boolean containsCopyright(String section) {
        return section.contains("Copyright") || section.contains("版权所有");
    }

    /** Small demo: a tag whose quoted attribute contains a greater-than sign. */
    public static void main(String[] args) {
        System.out.println("===============");
        String s = "<img  class='fit-image' onload='javascript:if(this.width>498)this.width=498;' />hello";
        System.out.println(TextExtract.preProcess(s));
    }

}

时间： 2024-12-13 09:57:03

crawler_基于块儿统计正文抽取_改进版的相关文章

网页正文抽取

转自丕子:http://www.zhizhihu.com/html/y2013/4202.html 总结我用过的网页正文抽取工具: decruft http://t.cn/S7bVEC python-readability http://t.cn/zYeoZ8b boilerpipe http://t.cn/h41EEs python-boilerpipe http://t.cn/zYeoyPw pismo http://t.cn/zYeoyP2 Goose http://t.cn/zYeoZ8G

基于区间统计的颜色直方图图像匹配算法

算法的原理在: 点击打开链接原理大概意思是:将R,G,B各分量信息颜色信息划分为 N 区间. 例如下图:4X4X4 的区间 red 0-63 64-127 128-191 192-255 blue 0-63 43 78 18 0 64-127 45 67 33 2 128-191 127 58 25 8 192-255 140 47 47 13 在统计各个区间内的像素数: ... MATLAB 代码实现就算函数是dhist.m 文件参数 bins 是需要划分的区间,filename

[个人网站搭建]·极简方式统计个人网页访问量（基于百度统计）

[个人网站搭建]·极简方式统计个人网页访问量(基于百度统计) 个人主页--> https://xiaosongshine.github.io/ 个人网站搭建github地址:https://github.com/xiaosongshine/djangoWebs 建好了网站,我们可以利用百度统计,可以很简单的看到自己网页的被访问次数. 使用方式也特别简单,只需要注册百度统计账户,然后把一段代码拷贝到你的网页里,十分简单方便. 实践演示照片: ? 1.注册百度统计站长版百度搜索"百度统计"…

基于文本密度的新闻正文抽取方法之Python实现

参考文章链接: http://www.cnblogs.com/jasondan/p/3497757.html http://d.wanfangdata.com.cn/Patent/CN201410007832.6/ 基于网页分析构思出的正文提取算法回顾以上的网页分析,如果按照文本密度来找提取正文,那么就是写这么一个算法,能够从过滤html标签后的文本中找到正文文本的起止行号,行号之间的文本就是网页正文部分. 还是从上面三个网页的分析结果看,他们都有这么一个特性:正文部分的文本密度要高出非正文部

网页正文抽取（包含提取图片）

转自:http://bbs.it-home.org/thread-12676-1-1.html /** *@author Xin Chen *Created on 2009-11-11 *Updated on 2010-08-09 *Email: [email protected] *Blog: http://hi.baidu.com/爱心同盟_陈鑫 *Modified By : Yang @ http://www.chainlt.com */ import java.util.ArrayLis

基于图的关键词抽取

项目研究背景: 在关键词抽取研究中,最常用的一种方法就是通过计算一篇文档中词语的TF-IDF值(term frequency-inverse document frequency),并对它们进行排序选取TopK个作为关键词,这是一种无监督的方法.另外一种方法是通过有监督的方法,通过训练学习一个分类器,将关键词抽取问题转化为对每个词语的二分类问题,从而选择出合适的关键词. 无监督和有监督各有各的优势和缺点:无监督学习,不需要人工标注,训练集合的过程,因此更加方便和快捷:然而监督学习的方法,在进行了

异常检测(2)——基于概率统计的异常检测（1）

某个工厂生产了一批手机屏幕,为了评判手机屏幕的质量是否达到标准,质检员需要收集每个样本的若干项指标,比如大小.质量.光泽度等,根据这些指标进行打分,最后判断是否合格.现在为了提高效率,工厂决定使用智能检测进行第一步筛选,质检员只需要重点检测被系统判定为“不合格”的样本. 智能检测程序需要根据大量样本训练一个函数模型,也许我们的第一个想法是像监督学习那样,为样本打上“正常”和“异常”的标签,然后通过分类算法训练模型.假设xtest是数据样本,predict(xtest)来判断xtest是否是合格样

Oracle 分组统计，抽取每组前十

/**2018年6月14日潮州ORACLE 统计2017年用电量,按行业分类抽取用电量前十*/select * from (select t.yhbh 用户编号, t.yhmc 用户名称, t.jldbh 计量点编号, (select m.dmbmmc from npmis_xt_dmbm m where m.dmfl = 'YDLXDM' and m.dmbm = t.ydlbdm) 用电类型, (select m.dmbmmc from npmis_xt_dmbm m where m.dmfl = 'YDLXDM' and m.dmbm = t.ydlbdm) 用电类型, (select m.dmbmmc from npmis_xt_dmbm m where m.dm

Codevs_1040_[NOIP2001]_统计单词个数_(划分型动态规划)

描述 http://codevs.cn/problem/1040/ 与Codevs_1017_乘积最大很像,都是划分型dp. 给出一个字符串和几个单词,要求将字符串划分成k段,在每一段中求共有多少单词(两个单词不能共享第一个字母),将每一段中的单词个数相加,求最大值. 1040 统计单词个数 2001年NOIP全国联赛提高组时间限制: 1 s 空间限制: 128000 KB 题目等级 : 黄金 Gold 题目描述 Description 给出一个长度不超过200的由小写英文字母组成的字母串(约