先说明一下背景:这是我实习期间做的一个项目。当时合作公司没有提供获取新闻的接口,但项目又急着上线,所以总监让我先做一个简单的抓取程序。现将主要的工具类 NewsUtil.java 贴出来供大家参考。
NewsUtil.java
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Helper class for scraping news articles from a web site.
 *
 * <p>Typical use: pass the site's index page to {@link #findUrlByUrl(String)} to
 * collect the article links, then pass each link to
 * {@link #findContentByUrl(String)} to extract the article markup.
 *
 * <p>Created 2015-07-28 15:15:04.
 *
 * @author geenkDC
 */
public class NewsUtil {

    /** Charset used to decode every downloaded page. */
    private static final String PAGE_CHARSET = "utf-8";

    /** Matches absolute {@code http://} links that end in {@code .jhtml}. */
    private static final Pattern NEWS_URL_PATTERN =
            Pattern.compile("http://[a-zA-Z0-9_\\.:\\d/?=&%]+\\.jhtml");

    /** An article link is recognised by a purely numeric, non-empty file name. */
    private static final Pattern NUMERIC_ID_PATTERN = Pattern.compile("[0-9]+");

    /**
     * Matches the article container: from the opening {@code con_box} div up to
     * the opening tag of the {@code left_con} div that follows it. Both groups are
     * greedy, exactly as in the original expression.
     */
    private static final Pattern CONTENT_PATTERN = Pattern.compile(
            "<div class=\"con_box\">([\\s\\S]*)</div>([\\s\\S]*)<div class=\"left_con\">");

    /**
     * Downloads the page at {@code url} and collects every news link found in it.
     * The page is scanned line by line, so a link must sit on a single line.
     *
     * @param url address of the page to scan (for example the site's home page)
     * @return the news links in order of appearance; empty if none were found
     * @throws Exception a {@link java.net.MalformedURLException} if {@code url}
     *         is not a valid URL; I/O failures surface as {@link RuntimeException}
     *         with the underlying {@link IOException} as cause
     */
    public static ArrayList<String> findUrlByUrl(String url) throws Exception {
        URL pageUrl = new URL(url);
        ArrayList<String> urlList = new ArrayList<String>();
        for (String line : readLines(pageUrl)) {
            urlList.addAll(findUrl(line));
        }
        return urlList;
    }

    /**
     * Extracts news links from a piece of text.
     *
     * <p>A link qualifies when it matches {@code http://....jhtml} and the part
     * between its last {@code '/'} and the {@code .jhtml} suffix consists of
     * digits only (the article id).
     *
     * @param str text to scan; {@code null} is treated like an empty string
     * @return the qualifying links in order of appearance; never {@code null}
     */
    public static ArrayList<String> findUrl(String str) {
        ArrayList<String> urlList = new ArrayList<String>();
        if (str == null) {
            return urlList;
        }
        Matcher matcher = NEWS_URL_PATTERN.matcher(str);
        while (matcher.find()) {
            String link = matcher.group();
            // "http://" guarantees a '/', and the pattern guarantees the suffix,
            // so both indexes are valid and correctly ordered.
            String articleId = link.substring(
                    link.lastIndexOf('/') + 1, link.lastIndexOf(".jhtml"));
            if (NUMERIC_ID_PATTERN.matcher(articleId).matches()) {
                urlList.add(link);
            }
        }
        return urlList;
    }

    /**
     * Downloads the page at {@code url} and extracts the news content from it.
     *
     * <p>The page's lines are joined without any separator before matching, so
     * line terminators are not preserved in the returned markup.
     *
     * @param url address of a single news article
     * @return the matched content fragments; empty if the page has none
     * @throws Exception a {@link java.net.MalformedURLException} if {@code url}
     *         is not a valid URL; I/O failures surface as {@link RuntimeException}
     *         with the underlying {@link IOException} as cause
     */
    public static ArrayList<String> findContentByUrl(String url) throws Exception {
        URL pageUrl = new URL(url);
        StringBuilder page = new StringBuilder();
        for (String line : readLines(pageUrl)) {
            page.append(line);
        }
        return findContent(page.toString());
    }

    /**
     * Extracts the news content fragments from a page's markup.
     *
     * @param str full page markup; {@code null} is treated like an empty string
     * @return every region matching the content pattern, including the
     *         delimiting tags; never {@code null}
     */
    public static ArrayList<String> findContent(String str) {
        ArrayList<String> strList = new ArrayList<String>();
        if (str == null) {
            return strList;
        }
        Matcher matcher = CONTENT_PATTERN.matcher(str);
        while (matcher.find()) {
            strList.add(matcher.group());
        }
        return strList;
    }

    /**
     * Reads the resource behind {@code url} as text, one entry per line, and
     * always closes the underlying stream.
     */
    private static ArrayList<String> readLines(URL url) {
        ArrayList<String> lines = new ArrayList<String>();
        BufferedReader reader = null;
        boolean finished = false;
        try {
            URLConnection connection = url.openConnection();
            InputStream in = connection.getInputStream();
            reader = new BufferedReader(new InputStreamReader(in, PAGE_CHARSET));
            String line;
            while ((line = reader.readLine()) != null) {
                lines.add(line);
            }
            finished = true;
        } catch (IOException e) {
            throw new RuntimeException("URL读写错误:" + e.getMessage(), e);
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    // Report a close failure only when reading succeeded; otherwise
                    // the read error already propagating must not be masked.
                    if (finished) {
                        throw new RuntimeException("URL流关闭异常:" + e.getMessage(), e);
                    }
                }
            }
        }
        return lines;
    }
}
功能简单说明:
只要输入网站首页的 URL,程序就会自动获取其中匹配的新闻条目的 URL,再根据每个新闻条目的 URL 抓取该新闻的所有内容。
时间: 2024-10-26 09:49:06