正文提取中用到的正则表达式

#region 相关正则表达式
 /// <summary>
 /// Matches every fragment that is removed when reducing HTML to plain text.
 /// The alternatives, in order, are:
 /// BBCode-style pairs (<c>[tag]…[/tag]</c> or <c>[tag=value]…[/tag]</c>);
 /// <c>lj</c>: an <c>&lt;a …&gt;text&lt;/a&gt;</c> link with at least two characters of text
 /// whose following character is outside the listed CJK / full-width / punctuation set;
 /// <c>Style</c>, <c>select</c>, <c>Script</c>: whole element blocks;
 /// <c>Explein</c>: HTML comments; <c>li</c>: list items; <c>Html</c>: any remaining tag;
 /// <c>Other</c>: named entities such as <c>&amp;amp;</c>;
 /// <c>Other2</c>: <c>#</c> followed by six letters or digits;
 /// <c>Space</c>: whitespace runs; and numeric entities such as <c>&amp;#160;</c>.
 /// </summary>
 /// <remarks>
 /// The BBCode alternative used to end in <c>\1</c>. Under
 /// <see cref="RegexOptions.ExplicitCapture"/> unnamed groups do not capture, so <c>\1</c>
 /// referred to the named group <c>lj</c>, which never participates in that alternative,
 /// and the alternative could never match. The tag name is now captured in the named group
 /// <c>bbtag</c> and referenced with <c>\k&lt;bbtag&gt;</c>. The tag name also stops at
 /// <c>]</c> so a stray <c>[</c> cannot make the engine scan far ahead.
 /// Group access by name is unaffected; <c>bbtag</c> is a new leading named group.
 /// NOTE(review): the leading look-ahead of <c>lj</c> inspects the <c>&lt;</c> itself and so
 /// always succeeds; a look-behind was probably intended — confirm before changing.
 /// NOTE(review): the quote class contains U+2018 (‘) rather than an ASCII apostrophe;
 /// this may be a typographic substitution — confirm against the original source.
 /// </remarks>
 private static readonly Regex FilterAll = new Regex(
  @"(\[(?<bbtag>[^=\]]*)(=[^\]]*)?\][\s\S]*?\[/\k<bbtag>\])|(?<lj>(?=[^\u4E00-\u9FA5\uFE30-\uFFA0,."");])<a\s+[^>]*>[^<]{2,}</a>(?=[^\u4E00-\u9FA5\uFE30-\uFFA0,."");]))|(?<Style><style[\s\S]+?/style>)|(?<select><select[\s\S]+?/select>)|(?<Script><script[\s\S]*?/script>)|(?<Explein><\!\-\-[\s\S]*?\-\->)|(?<li><li(\s+[^>]+)?>[\s\S]*?/li>)|(?<Html></?\s*[^> ]+(\s*[^=>]+?=[‘""]?[^""‘]+?[‘""]?)*?[^\[<]*>)|(?<Other>&[a-zA-Z]+;)|(?<Other2>\#[a-z0-9]{6})|(?<Space>\s+)|(\&\#\d+\;)",
  RegexOptions.ExplicitCapture
  | RegexOptions.Multiline
  | RegexOptions.IgnoreCase); //(?<Link><a[\s\S]*?</a>)|
 //(?<Style><style[\s\S]+?/style>)|(?<select><select[\s\S]+?/select>)|(?<Script><script[\s\S]*?/script>)|(?<Explein><\!\-\-[\s\S]*?\-\->)|(?<li><li(\s+[^>]+)?>[\s\S]*?/li>)|(?<Html></?\s*[^> ]+(\s*[^=>]+?=[‘""]?[^""‘]+?[‘""]?)*?[^\[<]*>)|(?<Other>&[a-zA-Z]+;)|(?<Other2>\#[a-z0-9]{6})|(?<Space>\s+)
 /// <summary>
 /// Matches a single attribute-less <c>&lt;title&gt;</c> or <c>&lt;/title&gt;</c> tag.
 /// Whitespace is tolerated after <c>&lt;</c> and before <c>&gt;</c>; matching ignores case.
 /// </summary>
 private static readonly Regex FindTitle =
     new Regex(@"<\s*/?title\s*>",
         RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.ExplicitCapture);
 /// <summary>
 /// Captures, in the group <c>Content</c>, the shortest run of text enclosed by two
 /// title tags. Both delimiters accept an optional <c>/</c>, so the pattern itself does
 /// not distinguish the opening tag from the closing one.
 /// </summary>
 private static readonly Regex FindTitleContent =
     new Regex(@"<\s*/?title\s*>(?<Content>[\s\S]*?)<\s*/?title\s*>",
         RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.ExplicitCapture);
 /// <summary>
 /// Matches an attribute-less <c>&lt;h&gt;</c> / <c>&lt;/h&gt;</c> or
 /// <c>&lt;strong&gt;</c> / <c>&lt;/strong&gt;</c> tag, ignoring case.
 /// NOTE(review): only the bare name <c>h</c> is accepted, so numbered headings
 /// (<c>h1</c>–<c>h6</c>) do not match — confirm whether that is intended.
 /// </summary>
 private static readonly Regex FindHStrong =
     new Regex(@"<\s*/?h\s*>|<\s*/?strong\s*>",
         RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.ExplicitCapture);
 /// <summary>
 /// Matches attribute-less paragraph and row tags (<c>&lt;p&gt;</c>, <c>&lt;/p&gt;</c>,
 /// <c>&lt;tr&gt;</c>, <c>&lt;/tr&gt;</c>) and line breaks (<c>&lt;br&gt;</c>,
 /// <c>&lt;br/&gt;</c>, <c>&lt;br /&gt;</c>), ignoring case.
 /// </summary>
 private static readonly Regex FindPB =
     new Regex(@"<\s*/?p\s*>|<\s*br\s*/?>|<\s*/?tr\s*>",
         RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.ExplicitCapture);
 /// <summary>
 /// Matches the text <c>&amp;nbsp</c>, ignoring case. The trailing semicolon of the
 /// entity is not part of the pattern, so it is left in place by a replace.
 /// </summary>
 private static readonly Regex FindNbsp =
     new Regex(@"&nbsp",
         RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.ExplicitCapture);
 /// <summary>
 /// Captures, in the group <c>Content</c>, the shortest run of text (line breaks
 /// included) that precedes the next literal <c>$</c>; the <c>$</c> is consumed too.
 /// </summary>
 private static readonly Regex FindS =
     new Regex(@"(?<Content>[\s\S]*?)\$",
         RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.ExplicitCapture);
 /// <summary>
 /// Matches one sentence-punctuation character from the listed set (comma, full stop,
 /// exclamation mark, semicolon, colon, ellipsis, question mark, book-title marks and
 /// quotation marks). Per the original summary it is used to decide whether a piece
 /// of text is a regular sentence.
 /// </summary>
 private static readonly Regex IsSen =
     new Regex(@"[,.,。!!;;::……??《》“”""]",
         RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.ExplicitCapture);
 /// <summary>
 /// Matches the literal marker text <c>[(h)]</c>, ignoring case. Per the original
 /// summary it is used to spot junk sentences that carry too many [strong]/[h] markers.
 /// </summary>
 private static readonly Regex IsWs =
     new Regex(@"\[\(h\)\]",
         RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.ExplicitCapture);
 /// <summary>
 /// Matches one middle dot (<c>·</c>), hyphen (<c>-</c>) or colon. Per the original
 /// summary it is used to spot junk sentences containing too many of these characters.
 /// </summary>
 /// <remarks>
 /// The first alternative used to be <c>\[·]</c>: the escaped bracket made it match the
 /// literal three-character text <c>[·]</c> instead of the single character <c>·</c>.
 /// It is now a character class like the other two alternatives.
 /// </remarks>
 private static readonly Regex IsWsM = new Regex(
  @"[·]|[-]|[::]",
  RegexOptions.ExplicitCapture
  | RegexOptions.Multiline
  | RegexOptions.IgnoreCase);
 /// <summary>
 /// Matches forum (BBS) fingerprints: a floor marker made of <c>第</c>, 1–50 characters
 /// other than <c>楼</c>, then <c>楼</c>; or a "Powered by … Dvbbs" / "Powered by … Discuz"
 /// footer. Matching ignores case.
 /// </summary>
 private static readonly Regex IsBbsInfo =
     new Regex(@"第[^楼]{1,50}楼|Powered\s*/?by[\s\S]*?Dvbbs|Powered\s*/?by[\s\S]*?Discuz",
         RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.ExplicitCapture);

 /// <summary>
 /// Extracts the keywords meta tag. The value of the <c>content</c> attribute is captured
 /// in the group <c>KeyWords</c>, whether <c>name</c> comes before or after <c>content</c>.
 /// </summary>
 /// <remarks>
 /// The verbatim pattern used to be closed on the following line, so it ended with a
 /// literal line break. <see cref="RegexOptions.IgnorePatternWhitespace"/> is not set, so
 /// the content-before-name alternative only matched when a newline directly followed
 /// the tag. The string is now closed on the same line.
 /// NOTE(review): the quote class contains U+2018 (‘) rather than an ASCII apostrophe;
 /// this may be a typographic substitution — confirm against the original source.
 /// </remarks>
 private static readonly Regex mKeyWord = new Regex(
  @"<meta\s*name\s*=\s*[‘""]?keywords[‘""]?\s*content\s*=\s*[‘""]?(?<KeyWords>[^‘"">]*)[‘""]?[^>]*>|<meta\s*content\s*=\s*[‘""]?(?<KeyWords>[^‘"">]*)[‘""]?\s*name\s*=\s*[‘""]?keywords[‘""]?\s*[^>]*>",
  RegexOptions.ExplicitCapture | RegexOptions.Multiline | RegexOptions.IgnoreCase);
 /// <summary>
 /// Extracts the description meta tag. The value of the <c>content</c> attribute is captured
 /// in the group <c>description</c>, whether <c>name</c> comes before or after <c>content</c>.
 /// </summary>
 /// <remarks>
 /// The verbatim pattern used to be closed on the following line, so it ended with a
 /// literal line break. <see cref="RegexOptions.IgnorePatternWhitespace"/> is not set, so
 /// the content-before-name alternative only matched when a newline directly followed
 /// the tag. The string is now closed on the same line.
 /// NOTE(review): the quote class contains U+2018 (‘) rather than an ASCII apostrophe;
 /// this may be a typographic substitution — confirm against the original source.
 /// </remarks>
 private static readonly Regex mDescription = new Regex(
  @"<meta\s*name\s*=\s*[‘""]?description[‘""]?\s*content\s*=\s*[‘""]?(?<description>[^‘"">]*)[‘""]?[^>]*>|<meta\s*content\s*=\s*[‘""]?(?<description>[^‘"">]*)[‘""]?\s*name\s*=\s*[‘""]?description[‘""]?\s*[^>]*>",
  RegexOptions.ExplicitCapture | RegexOptions.Multiline | RegexOptions.IgnoreCase);

 /// <summary>
 /// Extracts the tagwords meta tag. The value of the <c>content</c> attribute is captured
 /// in the group <c>tagwords</c>, whether <c>name</c> comes before or after <c>content</c>.
 /// </summary>
 /// <remarks>
 /// The verbatim pattern used to be closed on the following line, so it ended with a
 /// literal line break. <see cref="RegexOptions.IgnorePatternWhitespace"/> is not set, so
 /// the content-before-name alternative only matched when a newline directly followed
 /// the tag. The string is now closed on the same line.
 /// NOTE(review): the quote class contains U+2018 (‘) rather than an ASCII apostrophe;
 /// this may be a typographic substitution — confirm against the original source.
 /// </remarks>
 private static readonly Regex mTag = new Regex(
  @"<meta\s*name\s*=\s*[‘""]?tagwords[‘""]?\s*content\s*=\s*[‘""]?(?<tagwords>[^‘"">]*)[‘""]?[^>]*>|<meta\s*content\s*=\s*[‘""]?(?<tagwords>[^‘"">]*)[‘""]?\s*name\s*=\s*[‘""]?tagwords[‘""]?\s*[^>]*>",
  RegexOptions.ExplicitCapture | RegexOptions.Multiline | RegexOptions.IgnoreCase);
 /// <summary>
 /// Matches a whole line shaped like a short "label: value" pair: at most 8 leading
 /// characters that are neither whitespace nor <c>说</c>, then a colon, then at most
 /// 10 further characters up to the end of the line. Per the original summary such
 /// lines are treated as junk sentences.
 /// </summary>
 private static readonly Regex IsWsMM =
     new Regex(@"^[^说\s]{0,8}?[::].{0,10}$",
         RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.ExplicitCapture);
 /// <summary>
 /// Finds the URL marker line written by the spider. The text following the
 /// <c>http://</c> prefix, up to the end of the line, is captured in the group <c>URL</c>.
 /// </summary>
 private static readonly Regex txtUrl =
     new Regex(@"当前URL为:http://(?<URL>.*)",
         RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.ExplicitCapture);
 /// <summary>
 /// Finds the anchor-description marker line written by the spider. The text following
 /// the marker, up to the end of the line, is captured in the group <c>Describe</c>.
 /// </summary>
 private static readonly Regex txtDescription =
     new Regex(@"当前链接描述为:(?<Describe>.*)",
         RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.ExplicitCapture);
 ///// <summary>
 ///// 取需要a标签
 ///// </summary>
 //private static readonly Regex cleanFirst = new Regex(
 // @"([\u4E00-\u9FA5]|[\uFE30-\uFFA0]|[,."");])(?<Robbish1><a\s+[^>]*>)[^<]{1,6}(?<Robbish2></a>)([\u4E00-\u9FA5]|[\uFE30-\uFFA0]|[,."");])", RegexOptions.ExplicitCapture | RegexOptions.Multiline | RegexOptions.IgnoreCase);
 #endregion

  

时间: 2024-12-23 15:09:10

正文提取中用到的正则表达式的相关文章

项目中用了汉字正则表达式,出现异常:Cannot merge new index 65993 into a non-jumbo instruction

在项目中用了汉字正则表达式,编译并运行,Eclipse控制台输出如下异常信息: Unable to execute dex: Cannot merge new index 65993 into a non-jumbo instruction! Conversion to Dalvik format failed: Unable to execute dex: Cannot merge new index 65993 into a non-jumbo instruction! 解决方法: 将dex.f

python通用论坛正文提取\python论坛评论提取\python论坛用户信息提取

本人长期出售超大量微博数据,并提供特定微博数据打包,Message to [email protected] 背景 参加泰迪杯数据挖掘竞赛,这次真的学习到了不少东西,最后差不多可以完成要求的内容,准确率也还行.总共的代码,算上中间的过程处理也不超过500行,代码思想也还比较简单,主要是根据论坛的短文本特性和楼层之间内容的相似来完成的.(通俗点说就是去噪去噪去噪,然后只留下相对有规律的日期,内容) 前期准备 软件和开发环境: Pycharm,Python2.7,Linux系统 用的主要Python

c#第五次作业---正文提取

1.正文文本 1.正文文本 watermark/2/text/aHR0cDovL2Jsb2cuY3Nkbi5uZXQv/font/5a6L5L2T/fontsize/400/fill/I0JBQkFCMA==/dissolve/70/gravity/Center" width="400" height="200" style="border:none; max-width:100%"> 2.带标签文本 3.原始网页 http://

开发中用到的正则表达式

2-38范围中取偶数,不要20的正则表达式 String RegularExpression = "^[2468]$|[13]{1}[02468]|2[2468]";1-39范围中取奇数 String RegularExpression = "^[13579]$|[13]{1}[13579]|2[13579]";

[scrapy] scrapy 使用goose作为正文提取

import scrapy from goose import Goose class Article(scrapy.Item): title = scrapy.Field() text = scrapy.Field() class MyGooseSpider(scrapy.Spider): name = 'goose' start_urls = [ 'http://blog.scrapinghub.com/2014/06/18/extracting-schema-org-microdata-u

js中用到的正则表达式

1.匹配IP地址 function isIP(value){ return /^((25[0-5]|2[0-4][0-9]|1[0-9]{2}|[0-9]{1,2})\.){3}(25[0-5]|2[0-4][0-9]|1[0-9]{2}|[0-9]{1,2})$/i.test(value); } 2.匹配URL function isURL(value){ return /^((http|https):\/\/(\w+:{0,1}\w*@)?(\S+)|)(:[0-9]+)?(\/|\/([\

jmeter正则表达式提取器--关联

http://desert3.iteye.com/blog/1394934 1.http://www.cnblogs.com/quange/archive/2010/06/11/1756260.html 2.http://blog.csdn.net/zhangren07/archive/2010/10/15/5944158.aspx <input type="hidden" name="hidName" value="(.*)">  

Jmeter正则表达式提取器的使用方法(转)

下面简单介绍一下Jmeter正则表达式提取器的使用方法. 1.添加Jmeter正则表达式提取器:在具体的Request下添加Jmeter正则表达式提取器(Jmeter正则表达式在“后置处理器”下面)  例1如下: 引用名称: tokenid(自己定义) 正则表达式:<input type="hidden" name="org.apache.struts.taglib.html.TOKEN" value="(.*?)"> 模板:$1$

Jmeter正则表达式提取多个值示例

首先了解一下常用正则表达式的语法 \d           数字 \w          数字或者字母 .             可以匹配任意字符 星号*     表示任意个字符 +          表示至少一个字符 ?           表示0或者1个字符 {n}        表示n个字符 {n,m}    表示n-m个字符 \s         表示空白符 括号[]   表示范围,比如: [0-9a-zA-Z\_] 可以匹配一个数字.字母或者下划线 ^    表示行的开头,^\d表示