用java实现新浪爬虫,代码完整剖析(仅针对当前SinaSignOn有效)

先来看我们的web.xml文件,如下

 1 <!DOCTYPE web-app PUBLIC
 2  "-//Sun Microsystems, Inc.//DTD Web Application 2.3//EN"
 3  "http://java.sun.com/dtd/web-app_2_3.dtd" >
 4
 5 <web-app>
 6   <display-name>MySinaSpider</display-name>
 7     <listener>
 8         <listener-class>main.java.sina.spider.StartSpiderLisenter</listener-class>
 9       </listener>
10 </web-app>

这样的配置当启动tomcat的时候,就会运行爬虫,然后再看我们的StartSpiderLisenter类,如下

 1 package main.java.sina.spider;
 2
 3 import javax.servlet.ServletContextEvent;
 4 import javax.servlet.ServletContextListener;
 5 import main.java.sina.bean.info.LoginInfo;
 6 import main.java.sina.utils.Constant;
 7
 8 public class StartSpiderLisenter implements ServletContextListener{
 9
10     public void contextDestroyed(ServletContextEvent arg0) {
11
12     }
13
14     public void contextInitialized(ServletContextEvent arg0) {
15         Constant.personalHomePage = "http://weibo.com/zhaoyao2012/home"; //填写你自己的新浪微博个人主页
16         LoginInfo.username = "***"; //填写你的新浪微博用户名
18         LoginInfo.password = "***"; //填写你的新浪微博密码
19         Constant.enableProxy = false; //是否使用代理
20         Spider.start();
21     }
22
23 }

很明显,StartSpiderLisenter 类实现了 ServletContextListener 这个接口,因此必须实现它的两个方法:contextInitialized 和 contextDestroyed,它们分别在 Web 应用上下文初始化和销毁的时候被容器调用。可以看到,在 contextInitialized 方法中调用了 Spider.start() 方法。那么我们来看看 Spider 这个类,如下:

  1 package main.java.sina.spider;
  2
  3 import java.io.IOException;
  4 import java.util.regex.Matcher;
  5 import java.util.regex.Pattern;
  7 import org.quartz.JobBuilder;
  8 import org.quartz.JobDetail;
  9 import org.quartz.Scheduler;
 10 import org.quartz.SchedulerException;
 11 import org.quartz.SchedulerFactory;
 12 import org.quartz.SimpleScheduleBuilder;
 13 import org.quartz.SimpleTrigger;
 14 import org.quartz.TriggerBuilder;
 15 import org.quartz.impl.StdSchedulerFactory;
 17 import main.java.sina.bean.info.LoginInfo;
 18 import main.java.sina.httpclient.LoginSina;
 19 import main.java.sina.httpclient.SpiderSina;
 20 import main.java.sina.job.KeywordSearchJob;
 21 import main.java.sina.utils.Constant;
 22 import main.java.sina.utils.HttpHelper;
 23 import main.java.test.SpiderTest;
 24
 25 public class Spider {
 26
 27     public static void main(String[] args) {
 28
 29         Constant.personalHomePage = "****";
 30         LoginInfo.username = "****";
 31         LoginInfo.password = "****";
 32         Constant.enableProxy = false;
 33         Constant.hourbefore = 0;  //这个参数用于设置时差
 34         start();
 35
 36     }
 37     public static void start() {
 38
 39         final SchedulerFactory factory = new StdSchedulerFactory();
 40         try {
 41             Scheduler scheduler = factory.getScheduler();
 42             JobDetail jobDetail = JobBuilder.newJob(KeywordSearchJob.class)
 43                     .withIdentity("keywordSearch", "weibo").build();
 44             SimpleTrigger trigger = TriggerBuilder.newTrigger()
 45                     .withIdentity("keywordSearch", "weibo")
 46                     .withSchedule(SimpleScheduleBuilder.repeatHourlyForever())
 47                     .build();
 48             scheduler.scheduleJob(jobDetail, trigger);
 49             scheduler.start();
 50         } catch (SchedulerException e) {
 51             e.printStackTrace();
 52         }
 53     }
 54
 55     public static SpiderSina createSpider() {
 56         LoginSina ls = new LoginSina(LoginInfo.username, LoginInfo.password);
 57         ls.dologinSina();
 58         ls.redirect();
 59         SpiderSina spider = new SpiderSina(ls);
 60
 61         return spider;
 62     }
 63
 64     public static void sendMidsofDays(SpiderSina spider,String keyword, String fromdate,
 65             String todate) {
 66
 67         try {
 68             String midsString = "";
 69             for (int i = 1; i <= 50; i++) {
 70                 String htmlContent = spider
 71                         .search(keyword, i, fromdate, todate);
 72                 if (htmlContent.contains("noresult_support")) {
 73                     break;
 74                 }
 75                 System.out.println(i);
 76                 Pattern pattern = Pattern.compile("<div mid=\"([0-9]*)\"");
 77
 78                 String start = "\"pid\":\"pl_weibo_direct\"";
 79                 try {
 80                     htmlContent = htmlContent.substring(htmlContent
 81                             .indexOf(start));
 82                 } catch (Exception e) {
 83                     htmlContent = htmlContent.substring(1);
 84                 }
 85                 htmlContent = htmlContent.replace("\\\"", "\"");
 86                 htmlContent = htmlContent.replace("\\/", "/");
 87                 Matcher matcher = pattern.matcher(htmlContent);
 88                 while (matcher.find()) {
 89                     System.out.println(matcher.group(1));
 90                     midsString += matcher.group(1) + ",";
 91                 }
 92                 if (i == 37) {
 93                     try {
 94                         Thread.sleep(1000 * 60 * 30);
 95                     } catch (InterruptedException e) {
 96                         e.printStackTrace();
 97                     }
 98                 }
 99             }
100             System.out.println(midsString);
101             HttpHelper.getLiveData(midsString, Constant.CommentUrl);
102         } catch (IOException e) {
103             e.printStackTrace();
104         }
105
106     }
107 }

我们在Spider.start()方法中,看到了作业KeywordSearchJob.class,那么我们来看看这个KeywordSearchJob类的实现,如下:

 1 package main.java.sina.job;
 2
 3 import org.quartz.Job;
 4 import org.quartz.JobExecutionContext;
 5 import org.quartz.JobExecutionException;
 6 import main.java.sina.httpclient.SpiderSina;
 7 import main.java.sina.spider.Spider;
 8 import main.java.sina.utils.Constant;
 9 import main.java.sina.utils.Utils;
10
11 public class KeywordSearchJob implements Job {
12
13     public void execute(JobExecutionContext arg0) throws JobExecutionException {
14
15         Constant.enableProxy = false; //我的爬虫中没有使用代理,故值设为false.
16         String keyword = "%25E5%25AE%2581%25E6%25B3%25A2%25E5%25A4%25A7%25E5%25AD%25A6";//被编码后的关键字
17         String datehour = Utils.getDateOfSpecifiedPreHour(Constant.hourbefore);//这个工具类实现了时差格式的转换
18         SpiderSina spider = Spider.createSpider();
19         spider.forwardToWeiboPage();
20         Spider.sendMidsofDays(spider,keyword,datehour,datehour);
21     }
22
23 }

接下来,我们看几个工具类的实现:首先来看下Utils.java这个类,如下:它实现了日期的格式的一些转换

  1 package main.java.sina.utils;
  2
  3 import java.io.BufferedReader;
  4 import java.io.BufferedWriter;
  5 import java.io.File;
  6 import java.io.FileInputStream;
  7 import java.io.FileNotFoundException;
  8 import java.io.FileOutputStream;
  9 import java.io.FileWriter;
 10 import java.io.IOException;
 11 import java.io.InputStream;
 12 import java.io.InputStreamReader;
 13 import java.io.StringReader;
 14 import java.io.UnsupportedEncodingException;
 15 import java.text.ParseException;
 16 import java.text.SimpleDateFormat;
 17 import java.util.Calendar;
 18 import java.util.Date;
 19 import java.util.Properties;
 20
 21 import org.htmlparser.Parser;
 22 import org.htmlparser.lexer.Lexer;
 23 import org.htmlparser.lexer.Page;
 24 import org.htmlparser.util.DefaultParserFeedback;
 25 //  I/O操作类
 26 public class Utils {
 27
 28     public static Date getDateFromString(String dtext,Date fileCreateDate) {
 29         Date date=null;
 30         int y,mm,se;
 31         Calendar c = Calendar.getInstance();
 32         c.setTime(fileCreateDate);
 33         y = c.get(Calendar.YEAR); //年
 34         //d = c.get(Calendar.DAY_OF_MONTH); //日
 35         mm = c.get(Calendar.MINUTE); //分
 36         se = c.get(Calendar.SECOND);//秒
 37         if(dtext.contains("秒前")){
 38             int end=0;
 39             for(int i=0;i<dtext.length();i++){
 40                 if(dtext.charAt(i)>=‘0‘ && dtext.charAt(i)<=‘9‘){
 41                     end++;
 42                 }else{
 43                     break;
 44                 }
 45             }
 46             dtext=dtext.substring(0,end);
 47             int second=Integer.parseInt(dtext);
 48             c.set(Calendar.SECOND, se-second);
 49             date=c.getTime();
 50         }
 51         else if(dtext.contains("分钟前")){
 52             int end=0;
 53             for(int i=0;i<dtext.length();i++){
 54                 if(dtext.charAt(i)>=‘0‘ && dtext.charAt(i)<=‘9‘){
 55                     end++;
 56                 }else{
 57                     break;
 58                 }
 59             }
 60             dtext=dtext.substring(0,end);
 61             int minute=Integer.parseInt(dtext);
 62             c.set(Calendar.MINUTE, mm-minute);
 63             date=c.getTime();
 64         }else if(dtext.contains("今天")){
 65              dtext=dtext.replace("今天 ", "").trim();
 66              String ss[]=dtext.split(":");
 67              if(ss!=null && ss.length==2){
 68                  c.set(Calendar.HOUR_OF_DAY, Integer.parseInt(ss[0]));
 69                  c.set(Calendar.MINUTE, Integer.parseInt(ss[1]));
 70                  date=c.getTime();
 71              }
 72         }else if(dtext.contains("月")){
 73             dtext=y+"年".concat(dtext);
 74             SimpleDateFormat sf=new SimpleDateFormat("yyyy年MM月dd日 HH:mm");
 75             try {
 76                 date=sf.parse(dtext);
 77             } catch (ParseException e) {
 78                 e.printStackTrace();
 79             }
 80         }else if(dtext.contains("-")){
 81             SimpleDateFormat sf=new SimpleDateFormat("yyyy-MM-dd HH:mm");
 82             try {
 83                 date=sf.parse(dtext);
 84             } catch (ParseException e) {
 85                 e.printStackTrace();
 86             }
 87         }
 88         return date;
 89     }
 90     public static void writeFileFromStream(String filename,InputStream in){
 91         if(filename==null || filename.trim().length()==0)
 92             return;
 93         File file=new File(filename);
 94         if(!file.exists()){
 95             try {
 96                 file.createNewFile();
 97             } catch (IOException e) {
 98                 e.printStackTrace();
 99             }
100         }
101         FileOutputStream fou=null;
102         try {
103             fou = new FileOutputStream(file);
104             byte []buffer=new byte[1024*4];
105             int len=-1;
106             while((len=in.read(buffer))!=-1){
107                 fou.write(buffer,0,len);
108             }
109         } catch (FileNotFoundException e) {
110             e.printStackTrace();
111         } catch (IOException e) {
112             e.printStackTrace();
113         }finally{
114             if(in!=null)
115                 try {
116                     in.close();
117                 } catch (IOException e) {
118                     e.printStackTrace();
119                 }
120             if(fou!=null)
121                 try {
122                     fou.close();
123                 } catch (IOException e) {
124                     e.printStackTrace();
125                 }
126         }
127     }
128     public static void writeFileFromString(String filename,String str){
129         if(filename==null || filename.trim().length()==0)
130             filename="tmp.txt";
131         File file=new File(filename);
132         if(!file.exists()){
133             try {
134                 file.createNewFile();
135             } catch (IOException e) {
136                 e.printStackTrace();
137             }
138         }
139         BufferedWriter writer=null;
140         BufferedReader reader=null;
141         try {
142             writer=new BufferedWriter(new FileWriter(file));
143             reader=new BufferedReader(new StringReader(str));
144             String tmp=null;
145             StringBuffer buffer=new StringBuffer();
146             while((tmp=reader.readLine())!=null)
147                 buffer.append(tmp+"\n");
148             writer.write(buffer.toString());
149
150         } catch (IOException e) {
151             e.printStackTrace();
152         }finally{
153             try {
154                 reader.close();
155                 writer.close();
156             } catch (IOException e) {
157                 e.printStackTrace();
158             }
159         }
160
161     }
162
163
164
165     public static String getStringFromStream(InputStream in) {
166         BufferedReader reader=null;
167         reader = new BufferedReader(new InputStreamReader(in));
168         StringBuffer buffer=new StringBuffer();
169         String str=null;
170         try{
171             while((str=reader.readLine())!=null){
172                 buffer.append(str+"\n");
173             }
174             reader.close();
175         }catch(Exception ex){
176             ex.printStackTrace();
177         }
178         try {
179             return new String(buffer.toString().getBytes(),"utf-8");
180         } catch (UnsupportedEncodingException e) {
181             e.printStackTrace();
182             return "error:"+e.getMessage();
183         }
184     }
185   //得到数据库的配置信息
186     public static Properties getDBconfig(){
187         Properties properties=new Properties();
188         InputStream in = null;
189         try {
190             in = new FileInputStream(new File("config/dbconfig.ini"));
191             properties.load(in);
192         } catch (FileNotFoundException e) {
193             e.printStackTrace();
194         } catch (IOException e) {
195             e.printStackTrace();
196         }finally{
197             if(in!=null)
198                 try {
199                     in.close();
200                 } catch (IOException e) {
201                     e.printStackTrace();
202                 }
203         }
204         return properties;
205     }
206
207     public static Parser createParser(String inputHTML) {
208         Lexer mLexer = new Lexer(new Page(inputHTML));
209         Parser parser = new Parser(mLexer, new DefaultParserFeedback(
210                 DefaultParserFeedback.QUIET));
211         return parser;
212     }
213
214     public static String getDateOfSpecifiedPreHour(int hourNum){
215         SimpleDateFormat sdFormat = new SimpleDateFormat("yyyy-MM-dd-HH");
216         Date date = new Date();
217         System.out.println("date -" +date + " " + hourNum);
218         Calendar calendar = Calendar.getInstance();
219         calendar.setTime(date);
220         calendar.add(Calendar.HOUR_OF_DAY, -1 * hourNum);
221         System.out.println("date2 -" +sdFormat.format(calendar.getTime()));
222         return sdFormat.format(calendar.getTime());
223     }
224 }

再来看一下ThreadPool.java这个类,如下:这是一个线程工具类,定义了线程的一些动作

 1 package main.java.sina.utils;
 2
 3 import java.util.List;
 4 import java.util.concurrent.ExecutorService;
 5 import java.util.concurrent.Executors;
 6
 /**
  * Thread-pool helper: runs a fixed list of tasks on a fixed-size executor.
  */
 public class ThreadPool {
     private final ExecutorService service;
     private final List<? extends Runnable> threadList;

     /**
      * Creates the pool.
      *
      * @param limite     number of worker threads in the pool
      * @param threadList tasks to run; any {@link Runnable} is accepted (a {@code Thread}
      *                   is one), and {@code null} is treated as "no tasks"
      */
     public ThreadPool(int limite, List<? extends Runnable> threadList) {
         this.service = Executors.newFixedThreadPool(limite);
         this.threadList = threadList;
     }

     /**
      * Submits every task of the list to the pool, in list order. Does nothing when the
      * list is null or empty.
      */
     public void execute() {
         if (threadList == null || threadList.isEmpty()) {
             return;
         }
         for (Runnable task : threadList) {
             service.execute(task);
         }
     }

     /**
      * @return {@code true} once the pool has been shut down and all tasks have completed
      */
     public boolean isTerminated() {
         return service.isTerminated();
     }

     /**
      * Stops accepting new tasks; tasks already submitted still run.
      */
     public void shutDown() {
         service.shutdown();
     }
 }

然后再看一下Constant.java这个常量类,如下:常量类把系统中用到的一些常量集中写在这里,以后项目需要更改的时候,方便维护。

package main.java.sina.utils;

/**
 * Shared, mutable settings of the crawler, held in public static fields that other
 * classes read and assign directly.
 */
public class Constant {
    // Checked by HttpUtils.createHttpClient to decide whether a default proxy is configured.
    public static boolean enableProxy = false;
    // NOTE(review): not referenced in the code shown here; presumably the live-data counterpart of CommentUrl - confirm.
    public static String liveCommentUrl = "http://localhost:8080/social-hub-connector/loadingLiveData";
    // Endpoint that Spider.sendMidsofDays posts the collected message ids to.
    public static String CommentUrl = "http://localhost:8080/social-hub-connector/loadingData";
    // Personal home page of the account; assigned at startup by the listener and by Spider.main.
    public static String personalHomePage = "******";
    // NOTE(review): the code shown here logs in with LoginInfo.username/password, not these two - confirm they are still used.
    public static String weiboUsername = "*********";
    public static String weiboPassword = "*********";
    // Number of hours KeywordSearchJob subtracts from the current time when building the search date.
    public static int hourbefore = 0;
}

再来看一下Base64Encoder.java类,它用于对一些字段进行Base64编码,如下:

 1 package main.java.sina.utils;
 2
 /**
  * Minimal Base64 encoder using the standard alphabet ({@code A-Z a-z 0-9 + /}) with
  * {@code =} padding.
  */
 public class Base64Encoder {
     /** The 64 output characters, indexed by 6-bit value. */
     private static final char[] encodeTable =
             "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/".toCharArray();

     public Base64Encoder() {
     }

     /**
      * Encodes the bytes as Base64 text.
      *
      * @param from bytes to encode; must not be {@code null}
      * @return the encoded text, padded with {@code =} to a multiple of four characters;
      *         empty for empty input
      */
     public static String encode(byte[] from) {
         StringBuilder to = new StringBuilder(((from.length + 2) / 3) * 4);
         for (int i = 0; i < from.length; i += 3) {
             boolean hasSecond = i + 1 < from.length;
             boolean hasThird = i + 2 < from.length;
             // Mask to 0..255 so that negative bytes do not sign-extend into the shifts.
             int b0 = from[i] & 0xFF;
             int b1 = hasSecond ? from[i + 1] & 0xFF : 0;
             int b2 = hasThird ? from[i + 2] & 0xFF : 0;

             to.append(encodeTable[b0 >>> 2]);
             to.append(encodeTable[((b0 & 0x03) << 4) | (b1 >>> 4)]);
             to.append(hasSecond ? encodeTable[((b1 & 0x0F) << 2) | (b2 >>> 6)] : '=');
             to.append(hasThird ? encodeTable[b2 & 0x3F] : '=');
         }
         return to.toString();
     }
 }

下面这个EncodeSuAndSp类针对新浪的一些特殊的加密规则编写了相应的方法,在拼接最终的URL的时候会用到,例如根据servertime和nonce两个参数,按加密规则生成一串加密后的字符串:

 1 package main.java.sina.utils;
 2 import java.io.File;
 3 import java.io.FileReader;
 4
 5 import javax.script.Invocable;
 6 import javax.script.ScriptEngine;
 7 import javax.script.ScriptEngineManager;
 8
 /**
  * Computes the encoded user name and the encrypted password for the login request by
  * calling functions of the script {@code js/encrypt.js} through the Java scripting API.
  */
 public class EncodeSuAndSp {
     static ScriptEngineManager mgr = new ScriptEngineManager();
     static ScriptEngine engine = mgr.getEngineByExtension("js");
     static Invocable inv = (Invocable) engine;

     /**
      * Applies the script's {@code hex_sha1} three times: to the password, to that result,
      * and to the second result followed by {@code servertime} and {@code nonce}.
      *
      * @param password   plain password
      * @param servertime server time value appended before the last hash
      * @param nonce      nonce value appended before the last hash
      * @return the final hash; on failure the error is printed and the value computed so
      *         far is returned (empty when the script could not be loaded)
      */
     public static String getEncryptedP(String password,String servertime,String nonce){
         String value1 = "";
         try {
             loadEncryptScript();
             value1 = String.valueOf(inv.invokeFunction("hex_sha1", password));
             value1 = String.valueOf(inv.invokeFunction("hex_sha1", value1));
             value1 = String.valueOf(inv.invokeFunction("hex_sha1", value1 + servertime + nonce));
         } catch (Exception e) {
             e.printStackTrace();
         }
         return value1;
     }

     /**
      * Encodes the user name with the script's {@code encode} function and prints the result.
      *
      * @param username plain user name
      * @return the encoded user name, or an empty string when the script call fails
      */
     public static String getEncodedUsername(String username){
         String value1 = "";
         try {
             loadEncryptScript();
             value1 = String.valueOf(inv.invokeFunction("encode", username));
             System.out.println(value1);
         } catch (Exception e) {
             e.printStackTrace();
         }
         return value1;
     }

     /**
      * Evaluates {@code js/encrypt.js} in the shared engine and always closes the file.
      *
      * @throws Exception when no script engine is available or the script cannot be read
      *                   or evaluated
      */
     private static void loadEncryptScript() throws Exception {
         if (engine == null) {
             throw new IllegalStateException("No script engine is available for extension js");
         }
         FileReader reader = new FileReader(new File("js/encrypt.js"));
         try {
             engine.eval(reader);
         } finally {
             reader.close();
         }
     }
 }
package main.java.sina.utils;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.net.URLEncoder;
/**
 * URL encoding helpers and decoders for text that spells characters out as a backslash,
 * the letter u and four hexadecimal digits.
 */
public class EncodeUtils {

    /** The two characters that introduce a textual escape: a backslash and the letter u. */
    private static final String ESCAPE_PREFIX = "\\u";

    /** Full length of one escape: the prefix plus four hexadecimal digits. */
    private static final int ESCAPE_LENGTH = 6;

    /**
     * URL-encodes {@code str}.
     *
     * @param str text to encode
     * @param enc name of the character encoding
     * @return the encoded text
     * @throws RuntimeException wrapping the cause when {@code enc} is not supported
     */
    public static final String encodeURL(String str,String enc) {
        try {
            return URLEncoder.encode(str, enc);
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * URL-decodes {@code str}.
     *
     * @param str text to decode
     * @param enc name of the character encoding
     * @return the decoded text
     * @throws RuntimeException wrapping the cause when {@code enc} is not supported
     */
    public static final String decodeURL(String str,String enc) {
        try {
            return URLDecoder.decode(str, enc);
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Decodes the textual escapes in {@code str}, then removes the two-character
     * sequences backslash-r, backslash-n and backslash-t as well as {@code &nbsp;} and
     * {@code &gt}, and turns square brackets into double quotes. A full-length escape
     * whose digits are not hexadecimal becomes the NUL character; an escape cut short by
     * the end of the text is kept literally instead of failing.
     *
     * @param str text to decode; {@code null} yields an empty string
     * @return the decoded and cleaned text, never {@code null}
     */
    public static String unicdoeToGB2312(String str) {
        if (str == null) {
            return "";
        }
        // NOTE(review): "&gt" is removed without its semicolon, as before, so "&gt;" leaves ";".
        return decodeEscapes(str, true)
                .replace("\\r", "")
                .replace("\\n", "")
                .replace("\\t", "")
                .replace("&nbsp;", "")
                .replace("&gt", "")
                .replace("[", "\"")
                .replace("]", "\"");
    }

    /**
     * Decodes the textual escapes in {@code str}, then removes the two-character
     * sequences backslash-r, backslash-t and backslash-n as well as {@code &nbsp;} and
     * {@code &gt}. Malformed or truncated escapes are kept literally instead of failing.
     *
     * @param str text to decode; {@code null} yields an empty string
     * @return the decoded and cleaned text, never {@code null}
     */
    public static String unicodeTogb2312(String str) {
        if (str == null) {
            return "";
        }
        return decodeEscapes(str, false)
                .replace("\\r", "")
                .replace("\\t", "")
                .replace("&nbsp;", "")
                .replace("&gt", "")
                .replace("\\n", "");
    }

    /**
     * Replaces every complete escape by the character it names, in a single pass.
     *
     * @param str         text to decode, not null
     * @param nulOnBadHex when true, a full-length escape with non-hexadecimal digits is
     *                    replaced by the NUL character; when false it is copied literally
     */
    private static String decodeEscapes(String str, boolean nulOnBadHex) {
        final int length = str.length();
        StringBuilder sb = new StringBuilder(length);
        int i = 0;
        while (i < length) {
            if (str.startsWith(ESCAPE_PREFIX, i) && i + ESCAPE_LENGTH <= length) {
                try {
                    int code = Integer.parseInt(str.substring(i + 2, i + ESCAPE_LENGTH), 16);
                    sb.append((char) code);
                    i += ESCAPE_LENGTH;
                    continue;
                } catch (NumberFormatException ex) {
                    if (nulOnBadHex) {
                        sb.append((char) 0);
                        i += ESCAPE_LENGTH;
                        continue;
                    }
                    // otherwise fall through and copy the backslash literally
                }
            }
            sb.append(str.charAt(i));
            i++;
        }
        return sb.toString();
    }
}

HttpUtils.java这个类很关键,它封装了doGet()和doPost()等发送HTTP请求的方法,如下:

package main.java.sina.utils;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.HttpVersion;
import org.apache.http.NameValuePair;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.conn.params.ConnRoutePNames;
import org.apache.http.conn.params.ConnRouteParams;
import org.apache.http.cookie.Cookie;
import org.apache.http.entity.InputStreamEntity;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager;
import org.apache.http.impl.cookie.BasicClientCookie;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.CoreProtocolPNames;
import org.apache.http.params.HttpParams;
import org.apache.http.params.HttpProtocolParams;
import org.apache.http.protocol.BasicHttpContext;
import org.apache.http.protocol.ExecutionContext;
import org.apache.http.protocol.HTTP;
import org.apache.http.protocol.HttpContext;

/**
 * Static helpers built on Apache HttpClient ({@code org.apache.http}): sending GET and
 * POST requests and reading headers, cookies and bodies from the responses.
 *
 * NOTE(review): every request builds a new client and connection manager that is never
 * shut down here, because the response is handed back to the caller still open.
 */
public class HttpUtils {

    /**
     * Sends a GET request and prints the request URI and the response.
     *
     * @param url     address to request
     * @param headers request headers to add; may be {@code null}
     * @return the response, or {@code null} when the request failed
     */
    public static HttpResponse doGet(String url,Map<String,String> headers){
        HttpClient client = createHttpClient();
        HttpGet getMethod = new HttpGet(url);
        HttpResponse response = null;

        HttpContext httpContext = new BasicHttpContext();
        try {
            setHeaders(headers, getMethod);
            // The context has to be passed to execute(); otherwise it is never filled in
            // and the request lookup below always yields null.
            response = client.execute(getMethod, httpContext);
            Object realRequest = httpContext.getAttribute(ExecutionContext.HTTP_REQUEST);
            if (realRequest instanceof HttpUriRequest) {
                System.out.println(((HttpUriRequest) realRequest).getURI());
            }
        } catch (ClientProtocolException e) {
            e.printStackTrace();
        } catch (IOException e) {
            String msg = e.getMessage();
            if (msg != null && msg.contains("Truncated chunk")) {
                System.out.println(e.getMessage() +" 数据获取不完整,需要重新获取。");
            } else {
                System.out.println(e.getMessage() +" 连接被拒绝,需要降低爬取频率。");
            }
        } catch (RuntimeException e) {
            // Still best-effort, but no longer silent.
            e.printStackTrace();
        }
        System.out.println(response);
        return response;
    }

    /**
     * Sends a POST request with URL-encoded form parameters (UTF-8).
     *
     * @param url     address to request
     * @param headers request headers to add; may be {@code null}
     * @param params  form parameters; may be {@code null} or empty, in which case no body is sent
     * @return the response, or {@code null} when the request failed
     */
    public static HttpResponse doPost(String url,Map<String,String> headers,Map<String,String> params){
        HttpClient client = createHttpClient();
        HttpPost postMethod = new HttpPost(url);
        HttpResponse response = null;
        try {
            setHeaders(headers, postMethod);
            if (params != null && params.size() > 0) {
                List<NameValuePair> pairs = new ArrayList<NameValuePair>();
                for (Map.Entry<String, String> entry : params.entrySet()) {
                    pairs.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
                }
                postMethod.setEntity(new UrlEncodedFormEntity(pairs, HTTP.UTF_8));
            }
            response = client.execute(postMethod);
        } catch (ClientProtocolException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return response;
    }

    /**
     * Uploads one file: the body is a form-data part named {@code pic1} holding the
     * Base64-encoded file content, framed by the boundary taken from the
     * {@code Content-Type} request header (the text after its first {@code =}).
     *
     * NOTE(review): the boundary lines are written exactly as before (no leading dashes,
     * LF line ends) - confirm this is what the receiving endpoint expects.
     *
     * @param url      address to request
     * @param headers  request headers to add; may be {@code null}
     * @param fileName path of the file to upload
     * @return the response, or {@code null} when the request failed
     */
    public static HttpResponse doPost(String url,Map<String,String> headers,String fileName){
        HttpClient client = createHttpClient();
        HttpPost postMethod = new HttpPost(url);
        String boundary = "";
        HttpResponse response = null;
        try {
            if (headers != null && headers.size() > 0) {
                for (Map.Entry<String, String> entry : headers.entrySet()) {
                    postMethod.addHeader(entry.getKey(), entry.getValue());
                    if (entry.getKey().equals("Content-Type")) {
                        String tmp = entry.getValue();
                        boundary = tmp.substring(tmp.indexOf("=") + 1);
                    }
                }
            }
            File file = new File(fileName);

            StringBuffer buffer = new StringBuffer();
            buffer.append(boundary).append("\n")
                  .append("Content-Disposition: form-data; name=\"pic1\"; filename=\""+file.getName()).append("\"\n")
                  .append("Content-Type: image/pjpeg").append("\n")
                  .append("\n");

            System.out.println(buffer.toString());

            // Encode the raw bytes: reading the file as text lines would corrupt binary content.
            String tmpstr = Base64Encoder.encode(readFileBytes(file));
            buffer.append(tmpstr).append("\n");
            buffer.append(boundary+"--").append("\n");

            System.out.println(buffer.toString());

            byte[] payload = buffer.toString().getBytes();
            postMethod.setEntity(new InputStreamEntity(new ByteArrayInputStream(payload), payload.length));

            response = client.execute(postMethod);
        } catch (ClientProtocolException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return response;
    }

    /** Reads the whole file into memory and always closes it. */
    private static byte[] readFileBytes(File file) throws IOException {
        InputStream in = new FileInputStream(file);
        try {
            java.io.ByteArrayOutputStream out = new java.io.ByteArrayOutputStream();
            byte[] chunk = new byte[4096];
            int len;
            while ((len = in.read(chunk)) != -1) {
                out.write(chunk, 0, len);
            }
            return out.toByteArray();
        } finally {
            in.close();
        }
    }

    /**
     * Collects all headers of a response.
     *
     * @param response the response to read
     * @return the headers in response order, or {@code null} when there are none
     */
    public static List<Header> getReponseHeaders(HttpResponse response){
        Header[] hds = response.getAllHeaders();
        if (hds == null || hds.length == 0) {
            return null;
        }
        List<Header> headers = new ArrayList<Header>(hds.length);
        for (Header header : hds) {
            headers.add(header);
        }
        return headers;
    }

    /**
     * Adds every entry of {@code headers} to the request.
     *
     * @param headers header names and values; may be {@code null}
     * @param request request to add the headers to
     */
    public static void setHeaders(Map<String,String> headers,HttpUriRequest request){
        if (headers == null || headers.size() == 0) {
            return;
        }
        for (Map.Entry<String, String> entry : headers.entrySet()) {
            request.addHeader(entry.getKey(), entry.getValue());
        }
    }

    /**
     * Extracts the cookies announced by the {@code Set-Cookie} headers of a response.
     * Only the name and the value are kept; attributes after the first {@code ;} are dropped.
     *
     * @param response the response to read
     * @return the cookies, or {@code null} when the response sets none
     */
    public static List<Cookie> getResponseCookies(HttpResponse response){
        List<Cookie> cookies = null;
        Header[] hds = response.getAllHeaders();
        if (hds == null) {
            return null;
        }
        for (Header header : hds) {
            if (!header.getName().equalsIgnoreCase("Set-Cookie")) {
                continue;
            }
            if (cookies == null) {
                cookies = new ArrayList<Cookie>();
            }
            String[] nameAndValue = header.getValue().split(";")[0].split("=", 2);
            // A cookie written without '=' gets an empty value instead of failing the whole call.
            String cookievalue = nameAndValue.length > 1 ? nameAndValue[1] : "";
            cookies.add(new BasicClientCookie(nameAndValue[0], cookievalue));
        }
        return cookies;
    }

    /**
     * Joins cookies into one {@code name=value; name=value} string.
     *
     * @param cookies cookies to join
     * @return the joined string, or {@code null} when the list is null or empty
     */
    public static String setCookie2String(List<Cookie> cookies){
        if (cookies == null || cookies.size() == 0) {
            return null;
        }
        StringBuilder builder = new StringBuilder();
        for (int j = 0; j < cookies.size(); j++) {
            if (j > 0) {
                builder.append("; ");
            }
            Cookie c = cookies.get(j);
            builder.append(c.getName()).append("=").append(c.getValue());
        }
        return builder.toString();
    }

    /**
     * Opens the body of a response as a stream.
     *
     * @param response the response; may be {@code null}
     * @return the content stream, or {@code null} when there is no response, no body, or
     *         the content cannot be opened
     */
    public static InputStream getInputStreamFromResponse(HttpResponse response){
        if (response == null) {
            return null;
        }
        HttpEntity entity = response.getEntity();
        if (entity == null) {
            // Responses without a body carry no entity.
            return null;
        }
        try {
            return entity.getContent();
        } catch (IllegalStateException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Reads the body of a response as text.
     *
     * @param response the response; may be {@code null}
     * @return the body text, an empty string when the body is unavailable, or
     *         {@code null} when {@code response} is {@code null}
     */
    public static String getStringFromResponse(HttpResponse response){
        if (response == null) {
            return null;
        }
        InputStream in = getInputStreamFromResponse(response);
        return in != null ? Utils.getStringFromStream(in) : "";
    }

    /**
     * Creates a client backed by a thread-safe connection manager (at most 20 connections
     * in total), using HTTP/1.1 and UTF-8 content, with automatic redirect handling
     * switched off. A fixed default proxy is set when {@link Constant#enableProxy} is true.
     */
    private final static HttpClient createHttpClient() {
        HttpParams params = new BasicHttpParams();
        if (Constant.enableProxy) {
            String proxyHost = "web-proxy-sha.chn.hp.com";
            int proxyPort = 8080;
            params.setParameter(ConnRouteParams.DEFAULT_PROXY, new HttpHost(proxyHost, proxyPort));
        }
        HttpProtocolParams.setVersion(params, HttpVersion.HTTP_1_1);
        HttpProtocolParams.setContentCharset(params, "UTF-8");

        ThreadSafeClientConnManager clientmanager = new ThreadSafeClientConnManager();
        clientmanager.setMaxTotal(20);
        HttpClient client = new DefaultHttpClient(clientmanager, params);

        // Whether redirects back to the same location are allowed.
        client.getParams().setParameter("http.protocol.allow-circular-redirects", true);

        // Maximum number of redirects to follow.
        client.getParams().setParameter("http.protocol.max-redirects", 50);

        // Whether redirects are handled automatically.
        client.getParams().setParameter("http.protocol.handle-redirects", false);
        return client;
    }

    /**
     * Creates a client that goes through a proxy picked at random from
     * {@code proxy.properties}, read through {@code ReadIni.getDbini}; each entry's key is
     * used as the proxy host and its value as the port.
     *
     * @return the configured client
     * @throws RuntimeException when the file lists no proxy
     */
    public static HttpClient getDefaultHttpClientByProxy() {
        HttpClient httpclient = createHttpClient();
        Map<String, String> map = ReadIni.getDbini("proxy.properties");
        if (map.size() == 0) {
            throw new RuntimeException("无可用代理");
        }
        Set<String> hosts = map.keySet();
        String[] array = hosts.toArray(new String[hosts.size()]);
        String ip = array[new Random().nextInt(array.length)];
        HttpHost proxy = new HttpHost(ip, Integer.parseInt(map.get(ip)));

        httpclient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY,proxy);
        httpclient.getParams().setParameter(CoreProtocolPNames.PROTOCOL_VERSION, HttpVersion.HTTP_1_1);
        return httpclient;
    }
}

接下来看一个HttpHelper辅助类,如下:

/**
 *
 */
package main.java.sina.utils;

import java.io.IOException;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.methods.PostMethod;

/**
 * Helper for sending form-encoded POST requests with Commons HttpClient 3.x
 * and returning the response body as a string.
 */
public class HttpHelper {
    /**
     * POSTs {@code requestData} as the {@code mids} form parameter to {@code url}.
     *
     * @param requestData value sent as the "mids" parameter
     * @param url         target URL
     * @return the response body (also echoed to standard output)
     * @throws HttpException on a protocol error
     * @throws IOException   on a transport error
     */
    public static String getLiveData(String requestData,String url)
            throws HttpException, IOException {
        PostMethod postMethod = new PostMethod(url);
        postMethod.setParameter("mids", requestData);
        return executeAndRead(postMethod);
    }

    /**
     * POSTs a user id and hobby list to the fixed "loadingHobby" connector endpoint.
     *
     * @param userid value sent as the "userid" parameter
     * @param hobbys value sent as the "hobbys" parameter
     * @return the response body (also echoed to standard output)
     * @throws HttpException on a protocol error
     * @throws IOException   on a transport error
     */
    public static String getHobbyData(String userid, String hobbys)
            throws HttpException, IOException {
        PostMethod postMethod = new PostMethod("http://c0048925.itcs.hp.com:8080/connector/loadingHobby");
        postMethod.setParameter("userid", userid);
        postMethod.setParameter("hobbys", hobbys);
        return executeAndRead(postMethod);
    }

    /** Executes the prepared POST, reads the body and always releases the connection. */
    private static String executeAndRead(PostMethod postMethod)
            throws HttpException, IOException {
        try {
            new HttpClient().executeMethod(postMethod);
            String response = postMethod.getResponseBodyAsString();
            System.out.println(response);
            return response;
        } finally {
            // Release even when execute/read throws, otherwise the connection leaks.
            postMethod.releaseConnection();
        }
    }

}

ReadIni.java类,用于读取"键=值"格式的文本配置文件(例如代理列表 proxy.properties),如下:

package main.java.sina.utils;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;

/**
 * Minimal reader for text files made of {@code key=value} lines.
 */
public class ReadIni {

    /**
     * Reads {@code file} and returns its {@code key=value} pairs.
     *
     * <p>Blank lines and lines that do not contain a key and a value separated by
     * {@code =} are skipped. Keys and values are trimmed. If the file cannot be
     * opened or read, the stack trace is printed and whatever was read so far
     * (possibly nothing) is returned.
     *
     * @param file path of the file to read
     * @return a mutable map of the pairs found; never {@code null}
     */
    public static Map<String, String> getDbini(String file) {
        Map<String, String> map = new HashMap<String, String>();
        BufferedReader br = null;
        try {
            br = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
            String s;
            // Advance on every iteration; the old loop only advanced on non-blank
            // lines and therefore spun forever on the first blank line.
            while ((s = br.readLine()) != null) {
                if (s.trim().length() == 0) {
                    continue;
                }
                String[] s1 = getIni(s);
                if (s1.length < 2) {
                    // no '=' or empty value: not a usable pair
                    continue;
                }
                map.put(s1[0].trim(), s1[1].trim());
            }
        } catch (FileNotFoundException e1) {
            e1.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (br != null) {
                try {
                    br.close();
                } catch (Exception ignored) {
                    // nothing useful can be done if close fails
                }
            }
        }
        return map;
    }

    /**
     * Splits a line on {@code =}.
     *
     * @param str the line to split
     * @return the parts produced by {@code str.split("=")}
     */
    public static String[] getIni(String str) {
        String[] temp = str.split("=");
        return temp;
    }

}

然后,我们转到登录新浪的部分,来看一下LoginSina这个类的实现:

package main.java.sina.httpclient;

import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.math.BigInteger;
import java.security.InvalidKeyException;
import java.security.KeyFactory;
import java.security.NoSuchAlgorithmException;
import java.security.interfaces.RSAPublicKey;
import java.security.spec.InvalidKeySpecException;
import java.security.spec.RSAPublicKeySpec;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Scanner;

import javax.crypto.BadPaddingException;
import javax.crypto.Cipher;
import javax.crypto.IllegalBlockSizeException;
import javax.crypto.NoSuchPaddingException;

import org.apache.commons.codec.binary.Hex;
import org.apache.commons.httpclient.params.HttpParams;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.cookie.Cookie;
import org.springframework.core.io.ClassPathResource;

import main.java.sina.json.msg.PreLoginResponseMessage;
import main.java.sina.utils.Base64Encoder;
import main.java.sina.utils.EncodeUtils;
import main.java.sina.utils.HttpUtils;
import main.java.sina.utils.JsonUtils;
import main.java.sina.utils.Utils;

/**
 * Drives the Sina SSO login used by the crawler.
 *
 * <p>The constructor issues the "prelogin" request to obtain the server time, nonce,
 * RSA public key, key version and captcha id. {@link #dologinSina()} then posts the
 * encoded user name and the RSA-encrypted password, and {@link #redirect()} follows up
 * with the post-login GET. The request headers and response cookies of the last login
 * attempt are exposed through {@link #getHeaders()} and {@link #getCookies()}.
 */
public class LoginSina {
    private String username;
    private String password;
    private String rsakv;
    private String pubkey;

    // servertime and nonce are both required at login; they are part of the encrypted POST data
    private String servertime;// server time returned by prelogin
    private String nonce;// one-time string
    private String userid;// Weibo user id
    private String pcid;// needed when a captcha has to be entered
    private String userdomainname;// user domain name
    private String door;// captcha text typed by the operator

    private Map<String,String> headers=null;

    private List<Cookie> cookies=null;

    /**
     * Stores the credentials and immediately performs the prelogin request.
     *
     * @param username Weibo login name
     * @param password Weibo password (plain text; encrypted before it is sent)
     */
    public LoginSina(String username,String password){
        this.username=username;
        this.password=password;
        init();
    }

    /**
     * @return a copy of the headers of the last request, or {@code null} when there are none
     */
    public Map<String,String> getHeaders(){
        Map<String,String> hds=null;
        if(headers!=null && headers.keySet().size()>0){
            hds=new HashMap<String,String>(headers);
        }
        return hds;
    }

    /**
     * @return a copy of the cookies of the last login response, or {@code null} when there are none
     */
    public List<Cookie> getCookies(){
        List<Cookie> cc=null;
        if(cookies!=null && cookies.size()>0){
            cc=new ArrayList<Cookie>(cookies);
        }
        return cc;
    }

    /**
     * Posts the login form. When the response does not contain {@code retcode=0} the
     * captcha image is downloaded, the operator is asked for its text on standard
     * input, and the login is attempted again.
     *
     * @return the response text of the attempt that ended the loop (the successful one)
     */
    public String dologinSina(){
        System.out.println("---do login, please hold on...---");
        String url="http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.5)";//v1.3.17
        Map<String,String> headers=new HashMap<String,String>();
        Map<String,String> params=new HashMap<String,String>();

        /* HTTP headers reference: http://www.cnblogs.com/yuzhongwusan/archive/2011/10/20/2218954.html */
        headers.put("Accept", "text/html, application/xhtml+xml, */*");
        headers.put("Referer", "http://login.sina.com.cn/member/my.php?entry=sso");
        headers.put("Accept-Language", "zh-cn");
        headers.put("User-Agent", "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; BOIE9;ZHCN");
        headers.put("Host", "login.sina.com.cn");
        headers.put("Connection", "Keep-Alive");
        headers.put("Content-Type", "application/x-www-form-urlencoded");
        headers.put("Cache-Control", "no-cache");
        params.put("encoding", "UTF-8");
        params.put("entry", "weibo");
        params.put("from", "");
        params.put("prelt", "112");
        params.put("gateway", "1");
        params.put("nonce", nonce);
        params.put("pwencode", "rsa2");//wsse
        params.put("returntype", "META");
        params.put("pagerefer", "");
        params.put("savestate", "7");
        params.put("servertime", servertime);
        params.put("rsakv", rsakv);
        params.put("service", "miniblog");
        params.put("sp", getEncryptedP());
        params.put("ssosimplelogin", "1");
        params.put("su", getEncodedU());
        params.put("url", "http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack");
        params.put("useticket", "1");
        params.put("vsnf", "1");
        if(door!=null && pcid!=null){
            // The captcha answer was previously read but never submitted, so a retry could
            // not succeed. NOTE(review): "pcid"/"door" are the field names used by
            // ssologin.js for the captcha — confirm against the current login form.
            params.put("pcid", pcid);
            params.put("door", door);
        }
        HttpResponse response=HttpUtils.doPost(url, headers, params);
        this.cookies=HttpUtils.getResponseCookies(response);
        this.headers=headers;
        String responseText=HttpUtils.getStringFromResponse(response);
        try {
            // NOTE(review): re-decodes the platform-default bytes as GBK; whether this is
            // right depends on how HttpUtils.getStringFromResponse decoded the body.
            responseText=new String(responseText.getBytes(),"GBK");
            if(!responseText.contains("retcode=0")){
                downloadCheckImage();
                this.nonce=getnonce();
                Scanner s=new Scanner(System.in);
                if(responseText.contains("retcode=4049"))
                    System.out.println("请输入验证码:");
                else if(responseText.contains("retcode=2070")){
                    System.out.println("验证码不正确,请再次输入验证码:");
                }
                this.door=s.next();
                // Return the retry's outcome instead of the stale failure text, and do
                // not fall through to the success banner for a failed attempt.
                return dologinSina();
            }
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
        System.out.println("Congratulations, you have login success!");
        return responseText;
    }

    /**
     * Performs the GET on weibo.com/ajaxlogin.php that follows a login, sending the
     * login cookies as a {@code Cookie} header. Must be called after {@link #dologinSina()}.
     *
     * @return the response text of the second GET
     */
    public String redirect(){
        String cookieValue=HttpUtils.setCookie2String(this.cookies);
        this.headers.clear();
        this.headers.put("Accept", "image/gif, image/jpeg, image/pjpeg, image/pjpeg, application/x-shockwave-flash, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, */*");
        this.headers.put("Accept-Language", "zh-cn");
        this.headers.put("Connection", "Keep-Alive");
        this.headers.put("Host", "sina.com.cn");
        this.headers.put("Referer", "http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.15)");
        // The header name was "User", which is not a request header any server reads;
        // every other request in this class sends the same value as "User-Agent".
        this.headers.put("User-Agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; QQDownload 691)");
        this.headers.put("Cookie", cookieValue);
        String url="http://weibo.com/ajaxlogin.php?" +
                "framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack&" +
                "sudaref=weibo.com";
        // NOTE(review): the same GET is issued twice and the first response is dropped;
        // kept as-is because it is unclear whether the first call is needed for its side effects.
        HttpResponse response=HttpUtils.doGet(url, this.headers);
        response=HttpUtils.doGet(url, this.headers);
        String responseText=HttpUtils.getStringFromResponse(response);
        return responseText;
    }

    /** Generates a random 6-character string of upper-case letters and digits used as nonce. */
    private String getnonce() {
        String x = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
        StringBuilder str = new StringBuilder(6);
        for (int i = 0; i < 6; i++) {
            str.append(x.charAt((int)Math.ceil(Math.random() * 1000000) % x.length()));
        }
        return str.toString();
    }

    /** Prelogin: fetches servertime, nonce, public key, key version and captcha id. */
    private void init(){
        String url=compositeUrl();
        Map<String,String> headers=new HashMap<String,String>();
        headers.put("Accept", "*/*");
        headers.put("Referer", "http://weibo.com/");
        headers.put("Accept-Language", "zh-cn");
        headers.put("User-Agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; QQDownload 691)");
        headers.put("Host", "login.sina.com.cn");
        headers.put("Connection", "Keep-Alive");
        HttpResponse response=HttpUtils.doGet(url, headers);
        String responseText=HttpUtils.getStringFromResponse(response);
        // The body is a JSONP call; keep only the JSON object between the outer braces.
        int begin=responseText.indexOf("{");
        int end=responseText.lastIndexOf("}");
        responseText=responseText.substring(begin,end+1);
        PreLoginResponseMessage plrmsg =JsonUtils.jsontoPreLoginResponseMessage(responseText);
        this.nonce=plrmsg.getNonce();
        this.servertime=plrmsg.getServertime()+"";
        this.pubkey=plrmsg.getPubkey();
        this.rsakv=plrmsg.getRsakv();
        this.pcid=plrmsg.getPcid();
    }

    /** Downloads the captcha image for {@code pcid} into the classpath resource checkImage.jpeg. */
    private void downloadCheckImage() {
        if(pcid==null) return;
        this.headers.remove("Content-Type");
        try {
            if(this.cookies != null){
                this.cookies.clear();
            }

        } catch (Exception e) {
            e.printStackTrace();
        }
        String cookieValue=HttpUtils.setCookie2String(this.cookies);
        this.headers.put("Cookie", cookieValue);
        String url="http://login.sina.com.cn/cgi/pin.php?r="+(long)(Math.random()*100000000)+"&s=0&p="+this.pcid;
        HttpResponse response=HttpUtils.doGet(url, headers);
        InputStream in=HttpUtils.getInputStreamFromResponse(response);
        try {
            Utils.writeFileFromStream(new ClassPathResource("checkImage.jpeg").getFile().getPath(), in);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Builds the prelogin URL for the current user name. */
    private String compositeUrl(){
        StringBuilder builder=new StringBuilder();
        builder.append("http://login.sina.com.cn/sso/prelogin.php?")
           .append("entry=weibo&callback=sinaSSOController.preloginCallBack&")
           .append("su="+getEncodedU())
           .append("&rsakt=mod&checkpin=1&client=ssologin.js(v1.4.5)&_="+System.currentTimeMillis());
        return builder.toString();
    }

    /** Encodes the user name: URL-encode as UTF-8, then Base64. Empty string when no name is set. */
    private String getEncodedU() {
        if(username!=null && username.length()>0){
            return Base64Encoder.encode(EncodeUtils.encodeURL(username,"utf-8").getBytes());
        }
        return "";
    }

    /** Encrypts "servertime \t nonce \n password" with the prelogin RSA key (exponent 0x10001). */
    private String getEncryptedP(){
        String data=servertime+"\t"+nonce+"\n"+password;
        String spT=rsaCrypt(pubkey, "10001", data);
        return spT;
    }

    /**
     * Convenience overload that assembles the message from its parts before encrypting.
     *
     * @param pubkey      RSA modulus as a hex string
     * @param exponentHex RSA public exponent as a hex string
     * @param pwd         plain-text password
     * @param servertime  server time from prelogin
     * @param nonce       nonce from prelogin
     * @return hex-encoded ciphertext, or "" if encryption fails
     */
    public static String rsaCrypt(String pubkey, String exponentHex, String pwd,String servertime,String nonce) {
          String data=servertime+"\t"+nonce+"\n"+pwd;
          return rsaCrypt(pubkey,exponentHex,data);
    }

    /**
     * RSA-encrypts {@code messageg} and returns the ciphertext as lower-case hex.
     *
     * @param pubkey      RSA modulus as a hex string
     * @param exponentHex RSA public exponent as a hex string
     * @param messageg    text to encrypt (converted with the platform default charset)
     * @return hex-encoded ciphertext, or "" if the key cannot be built or encryption fails
     */
    public static String rsaCrypt(String pubkey, String exponentHex, String messageg) {
            KeyFactory factory=null;
            try {
                factory = KeyFactory.getInstance("RSA");
            } catch (NoSuchAlgorithmException e1) {
                return "";
            }
            // RSAPublicKeySpec takes (modulus, publicExponent). The original locals were
            // named the other way round, although the values were passed in the right order.
            BigInteger modulus = new BigInteger(pubkey, 16);
            BigInteger publicExponent = new BigInteger(exponentHex, 16);
            RSAPublicKeySpec spec = new RSAPublicKeySpec(modulus, publicExponent);
            RSAPublicKey pub=null;
            try {
                pub = (RSAPublicKey) factory.generatePublic(spec);
            } catch (InvalidKeySpecException e1) {
                return "";
            }
            Cipher enc=null;
            byte[] encryptedContentKey =null;
            try {
                enc = Cipher.getInstance("RSA");
                enc.init(Cipher.ENCRYPT_MODE, pub);
                encryptedContentKey = enc.doFinal(messageg.getBytes());
            } catch (NoSuchAlgorithmException e1) {
                System.out.println(e1.getMessage());
                return "";
            } catch (NoSuchPaddingException e1) {
                System.out.println(e1.getMessage());
                return "";
            } catch (InvalidKeyException e1) {
                System.out.println(e1.getMessage());
                return "";
            } catch (IllegalBlockSizeException e1) {
                System.out.println(e1.getMessage());
                return "";
            } catch (BadPaddingException e1) {
                System.out.println(e1.getMessage());
                return "";
            }
            return new String(Hex.encodeHex(encryptedContentKey));
    }

    public void setUserid(String userid) {
        this.userid = userid;
    }

    public String getUserid() {
        return userid;
    }

    public void setUserdomainname(String userdomainname) {
        this.userdomainname = userdomainname;
    }

    public String getUserdomainname() {
        return userdomainname;
    }

}

SpiderSina类如下:

  1 package main.java.sina.httpclient;
  2 import java.util.HashMap;
  3 import java.util.List;
  4 import java.util.Map;
  5
  6 import org.apache.http.HttpResponse;
  7 import org.apache.http.cookie.Cookie;
  8
  9 import main.java.sina.utils.Constant;
 10 import main.java.sina.utils.EncodeUtils;
 11 import main.java.sina.utils.HttpUtils;
 12 import main.java.sina.utils.Utils;
 13
 14 public class SpiderSina {
 15     private LoginSina ls;
 16     private Map<String,String> headers;
 17     private final int  ADDFOLLOWING =1;
 18     private final int  CANCELFOLLOWING =2;
 19     public SpiderSina(LoginSina ls){
            // Keeps the login object and seeds the header map shared by every request of
            // this instance: browser-like defaults plus a Cookie header built from ls.getCookies().
 20         this.ls=ls;
 21         this.headers=new HashMap<String,String>();
 22         headers.put("Accept", "text/html, application/xhtml+xml, */*");
 23         headers.put("Accept-Language", "zh-cn");
 24         headers.put("User-Agent", "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; BOIE9;ZHCN");
 25         headers.put("Connection", "Keep-Alive");
 26         headers.put("Cache-Control", "no-cache");
 27         String cookieValue=HttpUtils.setCookie2String(ls.getCookies());
 28         headers.put("Cookie", cookieValue);
 29     }
 36     public String getGroupCategory(){
            // GETs http://q.weibo.com/ with the shared headers (Host switched to q.weibo.com)
            // and returns the body after EncodeUtils.unicdoeToGB2312
            // (presumably decodes \\u escapes — confirm against EncodeUtils).
 37         String url="http://q.weibo.com/";
 38         this.headers.put("Host", "q.weibo.com");
 39         HttpResponse response=HttpUtils.doGet(url, headers);
 40         String responseText=HttpUtils.getStringFromResponse(response);
 41         responseText=EncodeUtils.unicdoeToGB2312(responseText);
 42         return responseText;
 43     }
 44     public String search(String keyword, int pageNo){
            // GETs one page of s.weibo.com search results and returns the converted body.
            // NOTE(review): keyword is never used — the URL carries a fixed, already
            // percent-encoded search term; only pageNo varies.
 47         String url="http://s.weibo.com/weibo/%25E5%25AE%2581%25E6%25B3%25A2%25E5%25A4%25A7%25E5%25AD%25A6&page="+pageNo;
            // NOTE(review): literal cookie string; it replaces the Cookie header that the
            // constructor built from the login cookies.
 48         String cookieValue = "SINAGLOBAL=8556698272004.724.1417744632425; [email protected]; myuid=5439352084; wvr=6; [email protected]; _s_tentry=developer.51cto.com; SWB=usrmdinst_14; SUS=SID-5438576807-1419173757-GZ-lrze7-d8e1e3f082b428c12412c8ba30f0a6de; SUE=es%3D4cdfdd5d5f0f75141c092b32f89525a2%26ev%3Dv1%26es2%3D469e50c869315e57efeec3012c3bb6a8%26rs0%3DoWdG36CQ33LUEtKTvGn907Zy1mwFETvSVJsxeHEiaMPcKDB7pFxg596a2pLhFLJfQmswf4AvXYAkzTfemrYgWrz%252BQPustEA2wLNYufYpAZqFsGWanhTBq6elzB2yoZp41xcpy1WwXn1CuvzIzzEYpuILjHahkmJDQDQy6KaxlbA%253D%26rv%3D0; SUP=cv%3D1%26bt%3D1419173757%26et%3D1419260157%26d%3Dc909%26i%3Da6de%26us%3D1%26vf%3D0%26vt%3D0%26ac%3D27%26st%3D0%26uid%3D5438576807%26name%3Dsm2014121904%2540126.com%26nick%3DSocialMedia%25E5%259B%259B%25E5%25A8%2583%26fmp%3D%26lcp%3D; SUB=_2A255kq8tDeTxGeNK6FoU9yjEyzuIHXVa6DVlrDV8PUNbvtBeLW3TkW-bMoi0G_bBfpbS3TMqcXg6zDWFLA..; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WhGThsH46uNrx1VY0ApV0SR5JpX5KMt; ALF=1450709756; SSOLoginState=1419173757; WBStore=bc5ad8450c3f8a48|undefined; Apache=1027467835228.8901.1419173761694; ULV=1419173761704:6:6:1:1027467835228.8901.1419173761694:1418797827169; UOR=www.ilehao.com,widget.weibo.com,login.sina.com.cn; ULOGIN_IMG=14192385783486";
 49         headers.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
 50         //headers.put("Accept-Encoding", "gzip, deflate, sdch");
 51         headers.put("Accept-Language", "zh-CN");
 52         headers.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36");
 53         headers.put("Connection", "Keep-Alive");
 54         headers.put("Cache-Control", "max-age=0");
 55         headers.put("Referer", "http://login.sina.com.cn/sso/login.php?url=http%3A%2F%2Fs.weibo.com%2Fweibo%2F%2525E6%252583%2525A0%2525E6%252599%2525AE%26page%3D2&_rand=1419173756.6387&gateway=1&service=weibo&entry=miniblog&useticket=1&returntype=META");
 56         headers.put("Cookie", cookieValue);
 57         this.headers.put("Host", "s.weibo.com");
 58         HttpResponse response=HttpUtils.doGet(url, headers);
 59         String responseText=HttpUtils.getStringFromResponse(response);
 60         responseText=EncodeUtils.unicdoeToGB2312(responseText);
 61
 62
 63         return responseText;
 64     }
 65
 66     public String searchCommentsByUid(String uid){
            // GETs http://www.weibo.com/u/<uid> and returns the body after
            // EncodeUtils.unicdoeToGB2312.
 67
 68         String url="http://www.weibo.com/u/"+uid;
            // NOTE(review): literal cookie string; it replaces the Cookie header that the
            // constructor built from the login cookies.
 69         String cookieValue = "SINAGLOBAL=8556698272004.724.1417744632425; myuid=2035860051; wvr=6; YF-Ugrow-G0=ad06784f6deda07eea88e095402e4243; SSOLoginState=1423150079; YF-V5-G0=32eb5467e9bfc8b60c2d771056535ac5; _s_tentry=www.weibo.com; Apache=6264929557219.147.1423150103832; ULV=1423150103842:18:2:2:6264929557219.147.1423150103832:1422769721265; ULOGIN_IMG=1423233797946; YF-Page-G0=82cdcdfb16327a659fbb60cc9368fb19; SUS=SID-2035860051-1423286223-GZ-jdkh4-c8ea11de0a42151313986e52f9aa6017; SUE=es%3D8701ff5aca59244ff1ff263cf985bee6%26ev%3Dv1%26es2%3D7995c9eb7455697c09fac4f7486e14eb%26rs0%3DTyXXIRjcEw%252BeS5PaVSM%252FhQjc2JGhKBOe3uFTgShiIUAbPFI2eKtrgxM2wIi9A1xndiTFFM72zY%252FDKYFXONrgkao5cRo%252FHkydV%252FnaQjNmXoeESu5gi6Iq0aX883NhGR0utBVNZb5XaIG3X6HMMfBJC%252B7pnVHogEo8eD6cx8nzN5c%253D%26rv%3D0; SUP=cv%3D1%26bt%3D1423286223%26et%3D1423372623%26d%3Dc909%26i%3D6017%26us%3D1%26vf%3D0%26vt%3D0%26ac%3D0%26st%3D0%26uid%3D2035860051%26name%3Dshy_annan%2540126.com%26nick%3D%25E7%2594%25A8%25E6%2588%25B72035860051%26fmp%3D%26lcp%3D2013-08-18%252021%253A48%253A10; SUB=_2A2550e-fDeTxGeRO6FcZ9i7Mzj2IHXVap0ZXrDV8PUNbvtBuLWnTkW-gBGVORTA7J_lSZzAqzW6E50JjBQ..; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9Wh7oKNCGYcNnhlC6eqqQbbl5JpX5KMt; SUHB=0M20OGRPiOKzyc; ALF=1454822222; UOR=www.ilehao.com,widget.weibo.com,login.sina.com.cn";
 70         headers.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
 71         headers.put("Accept-Language", "zh-CN");
 72         headers.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36");
 73         headers.put("Connection", "Keep-Alive");
 74         headers.put("Cache-Control", "max-age=0");
 75         headers.put("Cookie", cookieValue);
 76         this.headers.put("Host", "www.weibo.com");
 77         HttpResponse response=HttpUtils.doGet(url, headers);
 78         String responseText=HttpUtils.getStringFromResponse(response);
 79         responseText=EncodeUtils.unicdoeToGB2312(responseText);
 82         return responseText;
 83     }
 85 // Fetches the search-result HTML for a keyword, a custom time range and a page number
 86 public String search(String keyword, int pageNo, String fromdate,String todate){
        // keyword is concatenated into the URL path as-is, so the caller must pass it
        // already encoded. fromdate/todate are inserted into "timescope=custom:<from>:<to>".
 87     StringBuffer stringBuffer = new StringBuffer(200);
 93     stringBuffer.append("http://s.weibo.com/weibo/"+ keyword +"&page=");
 94     stringBuffer.append(pageNo);
 95     stringBuffer.append("&typeall=1&suball=1&timescope=custom:");
 96     stringBuffer.append(fromdate);
 97     stringBuffer.append(":");
 98     stringBuffer.append(todate);
 99     stringBuffer.append("&Refer=g");
104     String url = stringBuffer.toString();
        // Re-uses whatever Cookie header is currently stored in the shared header map.
105     String cookieValue = headers.get("Cookie");
106     headers.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
107     //headers.put("Accept-Encoding", "gzip, deflate, sdch");
108     headers.put("Accept-Language", "zh-CN");
109     headers.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36");
110     headers.put("Connection", "Keep-Alive");
111     headers.put("Cache-Control", "max-age=0");
        // NOTE(review): the Referer uses a fixed search term, not keyword.
112     headers.put("Referer", "http://s.weibo.com/weibo/%25E5%25AE%2581%25E6%25B3%25A2%25E5%25A4%25A7%25E5%25AD%25A6&typeall=1&suball=1&timescope=custom:"+fromdate+":"+todate+"&Refer=g");
113     headers.put("Cookie", cookieValue);
114     this.headers.put("Host", "s.weibo.com");
115     HttpResponse response=HttpUtils.doGet(url, headers);
116     String responseText=HttpUtils.getStringFromResponse(response);
117     responseText=EncodeUtils.unicdoeToGB2312(responseText);
118
        // The whole page is echoed to standard output between two marker lines.
119     System.out.println("************htmlContent start***********");
120     System.out.println(responseText);
121     System.out.println("************htmlContent end***********");
125     return responseText;
127 }
129 public void forwardToWeiboPage(){
        // GETs Constant.personalHomePage and replaces the shared Cookie header with the
        // cookies carried by that response. The page text itself is not used.
130     String url = Constant.personalHomePage;
131     headers.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
133     headers.put("Accept-Language", "zh-CN");
134     headers.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36");
135     headers.put("Connection", "Keep-Alive");
        // NOTE(review): Host is set to s.weibo.com regardless of the host in
        // Constant.personalHomePage — confirm this is intended.
137     this.headers.put("Host", "s.weibo.com");
138     HttpResponse response=HttpUtils.doGet(url, headers);
139     String responseText=HttpUtils.getStringFromResponse(response);
140     responseText=EncodeUtils.unicdoeToGB2312(responseText);
141     List<Cookie> cookies = HttpUtils.getResponseCookies(response);
142     String cookie = HttpUtils.setCookie2String(cookies);
144     headers.put("Cookie", cookie);
146 }
150     public String getGroupCategory(int id){
            // GETs the q.weibo.com category page for the given category id and returns the
            // body after EncodeUtils.unicdoeToGB2312.
151         String url="http://q.weibo.com/class/category/?id="+id;
152         this.headers.put("Host", "q.weibo.com");
154         HttpResponse response=HttpUtils.doGet(url, headers);
155         String responseText=HttpUtils.getStringFromResponse(response);
156         responseText=EncodeUtils.unicdoeToGB2312(responseText);
157         return responseText;
158     }
169     // Gets the group administrators' ids; this is really the first HTML page of the member list
170     public String getGroupAdministrator(String groupid) {
171         String url="http://q.weibo.com/"+groupid+"/members/all";
            // Drop the AJAX-only headers that other methods leave behind in the shared map,
            // so this goes out as a plain page request.
172         this.headers.remove("Referer");
173         this.headers.put("Host", "q.weibo.com");
174         this.headers.remove("Content-Type");
175         this.headers.remove("x-requested-with");
176         HttpResponse response=HttpUtils.doGet(url, headers);
177         String responseText=HttpUtils.getStringFromResponse(response);
178         return responseText;
179     }
180     // Gets the member ids of a group for a given group id and page number — JSON data
181     public String getGroupMembers(String groupid,int pagenumber){
            // Sent as an XMLHttpRequest-style form POST to /ajax/members/page.
182         this.headers.put("Referer", "http://q.weibo.com/"+groupid+"/members/all");
183         this.headers.put("Host", "q.weibo.com");
184         this.headers.put("Content-Type", "application/x-www-form-urlencoded");
185         this.headers.put("x-requested-with", "XMLHttpRequest");
187         Map<String,String> params=new HashMap<String,String>();
188         params.put("_t", "0");
189         params.put("page", pagenumber+"");
190         params.put("gid", groupid);
191         params.put("query","");
192         params.put("tab", "all");
193         params.put("vip", "1");
194         String url="http://q.weibo.com/ajax/members/page";
195         HttpResponse response=HttpUtils.doPost(url, headers, params);
196         return HttpUtils.getStringFromResponse(response);
197     }
198     /*
199      *  Gets the posts of a micro-group (this form worked after several attempts).
200      *  Each call returns 50 posts; page is the page number; count (50) may be anywhere in 1-75, but every page still starts at a multiple of 50.
201      */
202     public String getGroupTopic(int page,int count,String gid){
203         this.headers.put("Referer", "http://q.weibo.com/"+gid);
204         this.headers.put("Host", "q.weibo.com");
205         this.headers.put("Content-Type", "application/x-www-form-urlencoded");
206         this.headers.put("x-requested-with", "XMLHttpRequest");
            // pre_page is the previous page number, except on page 1 where 2 is sent.
207         Integer pre_page=1;
208         if(page==1){
209             pre_page=2;
210         }else{
211             pre_page=page-1;
212         }
213         Map<String,String> params=new HashMap<String,String>();
214         params.put("_k", System.currentTimeMillis()+"");
215         params.put("_t", "0");
216         params.put("count", count+"");
217         //params.put("end_id", end_id);
218         params.put("gid", gid);
219         params.put("is_search","");
220         params.put("key_word", "");
221         params.put("me", "0");
222         params.put("mids", "");
223         params.put("new", "0");
224         params.put("page", page+"");
225         params.put("pagebar", "0");
226         params.put("pre_page", pre_page+"");
227         params.put("since_id", "0");
228         params.put("uid", "0");
229
230         String url="http://q.weibo.com/ajax/mblog/groupfeed";
231         HttpResponse response=HttpUtils.doPost(url, headers, params);
232         return HttpUtils.getStringFromResponse(response);
233     }
234     /*
235      *  Gets the number of posts in a micro-group.
236      *  The response actually also contains all the basic information of the group — JSON data.
237      */
238     public String getGroupMessageNumber(String gid){
239         this.headers.put("Referer", "http://q.weibo.com/"+gid);
240         this.headers.put("Host", "q.weibo.com");
241         this.headers.put("Content-Type", "application/x-www-form-urlencoded");
242         this.headers.put("x-requested-with", "XMLHttpRequest");
            // __rnd carries the current time in milliseconds.
243         String url="http://q.weibo.com/ajax/rightnav/groupprofile?gid="+gid+"&_t=0&__rnd="+System.currentTimeMillis();
244         HttpResponse response=HttpUtils.doGet(url, headers);
245         return HttpUtils.getStringFromResponse(response);
246     }
247     // Gets the group's home page (HTML); mainly used to obtain the MID of the first post
248     public String getgroupMainPage(String groupid) {
249         String url="http://q.weibo.com/"+groupid+"?topnav=1";
            // Drop the AJAX-only headers that other methods leave behind in the shared map.
250         this.headers.remove("Referer");
251         this.headers.put("Host", "q.weibo.com");
252         this.headers.remove("Content-Type");
253         this.headers.remove("x-requested-with");
254
255         HttpResponse response=HttpUtils.doGet(url, headers);
256         String responseText=HttpUtils.getStringFromResponse(response);
257         return responseText;
258     }
259     /*
260      * Gets the micro-groups of a category.
261      * categroyID: category id
262      * pagenumber: page number
263      * sort: ordering — 1 by member count, 2 by number of posts, 3 by creation time
264      * count: number of records per page
265      */
266     public String getGroupByCategroy(int categroyID,int pagenumber,int sort,int count){
267         this.headers.put("Referer", "http://q.weibo.com/class/category/?id="+categroyID);
268         this.headers.put("Host", "q.weibo.com");
269         this.headers.put("Content-Type", "application/x-www-form-urlencoded");
270         this.headers.put("x-requested-with", "XMLHttpRequest");
271         Map<String,String> params=new HashMap<String,String>();
272         params.put("_t", "0");
273         params.put("page", pagenumber+"");
274         params.put("id", categroyID+"");
275         params.put("sort",sort+"");
276         params.put("count", count+"");
277
278         String url="http://q.weibo.com/ajax/class/category";
279         HttpResponse response=HttpUtils.doPost(url, headers,params);
280         String responseText=HttpUtils.getStringFromResponse(response);
281         responseText=EncodeUtils.unicdoeToGB2312(responseText);
282         return responseText;
283     }
284     // Gets the emoticon ("face") list
285     public String getFaceList(){
286         String url="http://weibo.com/aj/mblog/face?type=face&_t=0&__rnd="+System.currentTimeMillis();
287         this.headers.put("Referer", "http://weibo.com/");
288         this.headers.put("Host", "weibo.com");
289         this.headers.put("Content-Type", "application/x-www-form-urlencoded");
290         this.headers.put("x-requested-with", "XMLHttpRequest");
291
292         HttpResponse response=HttpUtils.doGet(url, headers);
293         String responseText=HttpUtils.getStringFromResponse(response);
            // Besides being returned, the response is printed and written to tmpFile/faceList.txt.
294         System.out.println(responseText);
295         Utils.writeFileFromString("tmpFile/faceList.txt", responseText);
296         return responseText;
297     }
307     // Basic user info; the encoded content in the lower part of the user's page is what gets parsed later
308     public String getMemberInfo(String memberID){
309         String url="http://weibo.com/"+memberID+"/info";
310         this.headers.put("Host", "weibo.com");
311         this.headers.put("Referer", "http://weibo.com/u/"+memberID);
312         HttpResponse response=HttpUtils.doGet(url, headers);
313         String responseText=HttpUtils.getStringFromResponse(response);
314         return responseText;
315     }
316     // A user's fans; HTML page, 20 per request
317     public String getMemberFans(String memberID,int page){
            // NOTE(review): the query string carries a fixed uid=1689219395 that does not
            // depend on memberID — confirm whether it should.
318         String url="http://weibo.com/"+memberID+"/fans?&uid=1689219395&tag=&page="+page;
319         this.headers.put("Host", "weibo.com");
320         this.headers.put("Referer", "http://weibo.com/"+memberID+"/fans");
321         HttpResponse response=HttpUtils.doGet(url, headers);
322         String responseText=HttpUtils.getStringFromResponse(response);
323         return responseText;
324     }
325     // The users that a user follows; HTML page
326     public String getMemberFollowing(String memberID,int page){
327         String url="http://weibo.com/"+memberID+"/follow?page="+page;
328         this.headers.put("Host", "weibo.com");
329         this.headers.put("Referer", "http://weibo.com/"+memberID+"/follow");
330         HttpResponse response=HttpUtils.doGet(url, headers);
331         String responseText=HttpUtils.getStringFromResponse(response);
332         return responseText;
333     }
334
335     /*
336      *  @params
337      *   memberID:是用户ID
338      *   max_id:每次AJAX获得数据时上面一次的最后一个ID值
339      *   end_id:用户最新的一条微博的ID值
340      *   k:一个随机数
341      *   page:页号
342      *   pre_page:前一页
343      *   count:每次返回的数值  当max_id为null是 count=50 否则为15
344      *      pagebar:ajax时,第一次为0,第二次为1
345      *   注意:
346      *   1  用此请求,每次获得的数据格式都一样,用同样的解析方法来进行解析。
347      *   2 每次一页可以获得总共45条记录,需要三次请求。每次请求可获得15条记录。
348      *   3 max_id可以不用到,直接等于 end_id就可以了.
349      *   4 第一次请求时可以将end_id设置为NUll,即为第一次时翻页时的请求后边的滚动时必须有end_id参数,end_id为第一页的第一条ID即可。
350      */
351     //获得用户发布的微博信息   json格式的数据
352     public String getMemberReleaseTopic(String memberID,String end_id,Integer page,Integer pagebar){
353         String url="";
354         Integer pre_page=1;
355         Integer count=0;
356         String k=System.currentTimeMillis()+""+(int)(Math.random()*100000)%100;
357         if(end_id==null){
358             count=50;
359             if(page==1){
360                 pre_page=2;
361             }else{
362                 pre_page=page-1;
363             }
364             url="http://weibo.com/aj/mblog/mbloglist?" +
365             "page="+page+"&count="+count+"&pre_page="+pre_page+"&" +
366             "_k="+ k+"&uid="+memberID+
367             "&_t=0&__rnd="+System.currentTimeMillis();
368         }else{
369             count=15;
370             pre_page=page;
371             url="http://weibo.com/aj/mblog/mbloglist?" +
372             "page="+page+"&count="+count+"&max_id="+end_id+"&" +
373             "pre_page="+pre_page+"&end_id="+end_id+"&" +
374             "pagebar="+pagebar+"&_k="+k+"&" +
375             "uid="+memberID+"&_t=0&__rnd="+System.currentTimeMillis();
376         }
377         String cookieValue = "SINAGLOBAL=8556698272004.724.1417744632425; [email protected]; myuid=5439352084; YF-Ugrow-G0=4703aa1c27ac0c4bab8fc0fc5968141e; SSOLoginState=1421374583; wvr=6; YF-V5-G0=8c4aa275e8793f05bfb8641c780e617b; _s_tentry=login.sina.com.cn; Apache=2461283528245.9854.1421374588453; ULV=1421374588550:13:5:3:2461283528245.9854.1421374588453:1421210767499; UOR=www.ilehao.com,widget.weibo.com,login.sina.com.cn; SUS=SID-2035860051-1421462085-GZ-7jcgb-1539d643bae5195fb7f792b2ae77befb; SUE=es%3Df15e11ed09b6a0108a28adfa58609b78%26ev%3Dv1%26es2%3Da0f706efac5c89495062648a4de3e337%26rs0%3DZBxlOUv0mhmxyHfOVmZ3tH7tNvAp08BjPeLUJPdu9WzG38Dsm40px%252Bd9w21ycDpZQwBK3q0prFfNs%252F8ZuZSasa1eps%252FOGNxJ3CIHN8JN%252Fik6gVpIPgVeeRdalNWTIbth6hLa34uOp%252BXii%252Bxeib%252BvINsr%252FdOvQx6kjp6fsC44QXc%253D%26rv%3D0; SUP=cv%3D1%26bt%3D1421462085%26et%3D1421548485%26d%3Dc909%26i%3Dbefb%26us%3D1%26vf%3D0%26vt%3D0%26ac%3D2%26st%3D0%26uid%3D2035860051%26name%3Dshy_annan%2540126.com%26nick%3D%25E7%2594%25A8%25E6%2588%25B72035860051%26fmp%3D%26lcp%3D2013-08-18%252021%253A48%253A10; SUB=_2A255vboVDeTxGeRO6FcZ9i7Mzj2IHXVazdpdrDV8PUNbvtBuLVj-kW91jmbQSGo7Rn30RVvGP5KOgBgNgQ..; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9Wh7oKNCGYcNnhlC6eqqQbbl5JpX5KMt; ALF=1452998078; ULOGIN_IMG=14214638933178; YF-Page-G0=0acee381afd48776ab7a56bd67c2e7ac";
378         headers.put("Cookie", cookieValue);
379         this.headers.put("Referer", "http://weibo.com/u/"+memberID);
380         this.headers.put("Host", "www.weibo.com");
381         this.headers.put("Content-Type", "application/x-www-form-urlencoded");
382         this.headers.put("x-requested-with", "XMLHttpRequest");
383         url = "http://weibo.com/u/"+memberID;
384         HttpResponse response=HttpUtils.doGet(url, headers);
385         if(response==null){
386             return "";
387         }
388         return HttpUtils.getStringFromResponse(response);
389     }
390     /*
391      * ~~~~~~~~~~~~~~~~~~~~~获取用户的一些信息~~~end~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
392      */
393
394
395     //**********************************************************************************
396
397     /*
398      *  名人堂与达人信息
399      */
400     public String getVerified(String url){
401         this.headers.put("Host", "verified.weibo.com");
402         this.headers.put("Referer", "http://plaza.weibo.com/?topnav=1&wvr=4");
403         HttpResponse response=HttpUtils.doGet(url, headers);
404         String responseText=HttpUtils.getStringFromResponse(response);
405         return responseText;
406     }
407
408     public String getVerifiedMember(String path,Integer g_index){
409         String url="http://verified.weibo.com/aj/getgrouplist?g_index="+g_index+
410         "&path="+path+"&_t=0&__rnd="+System.currentTimeMillis();
411         this.headers.put("Host", "verified.weibo.com");
412         this.headers.put("Referer", path);
413         this.headers.put("Content-Type", "application/x-www-form-urlencoded");
414         this.headers.put("x-requested-with", "XMLHttpRequest");
415         HttpResponse response=HttpUtils.doGet(url, headers);
416         String responseText=HttpUtils.getStringFromResponse(response);
417
418         return responseText;
419     }
420
421     public String setArea(Integer provinceID){
422         this.headers.put("Referer", "http://club.weibo.com/list");
423         this.headers.put("Host", "club.weibo.com");
424         this.headers.put("Content-Type", "application/x-www-form-urlencoded");
425         this.headers.put("x-requested-with", "XMLHttpRequest");
426
427         Map<String,String> params=new HashMap<String,String>();
428
429         params.put("_t", "0");
430         params.put("city", "1000");
431         params.put("prov", provinceID+"");
432
433         String url="http://club.weibo.com/ajax_setArea.php";
434         HttpResponse response=HttpUtils.doPost(url, headers, params);
435
436         List<Cookie> cks=HttpUtils.getResponseCookies(response);
437         List<Cookie> cookies=ls.getCookies();
438         cookies.addAll(cks);
439         String cookieValue=HttpUtils.setCookie2String(cookies);
440         this.headers.put("Cookie", cookieValue);
441
442         return HttpUtils.getStringFromResponse(response);
443     }
444
445     public String getDaRen(Integer page){
446         String op="ltime";
447         String url="http://club.weibo.com/list?sex=3&op="+op+"&page="+page+"&";
448         Integer pre_page=(page<=1? 2:page-1);
449         this.headers.put("Host", "club.weibo.com");
450         this.headers.put("Referer", "http://club.weibo.com/list?sex=3&op=ltime&page="+pre_page+"&");
451         this.headers.remove("Content-Type");
452         this.headers.remove("x-requested-with");
453
454         HttpResponse response=HttpUtils.doGet(url, headers);
455         if(response!= null){
456             return HttpUtils.getStringFromResponse(response);
457         }
458         return "";
459
460     }
470     //发布一条文字微博
471     public String releaseTopic(String content){
472         this.headers.put("Referer", "http://weibo.com/");
473         this.headers.put("Host", "weibo.com");
474         this.headers.put("Content-Type", "application/x-www-form-urlencoded");
475         this.headers.put("x-requested-with", "XMLHttpRequest");
476         Map<String,String> params=new HashMap<String,String>();
477         params.put("_t", "0");
478         params.put("location", "home");
479         params.put("module", "stissue");
480         params.put("pic_id", "");
481         params.put("text", content);
482         String url="http://weibo.com/aj/mblog/add?__rnd="+System.currentTimeMillis();
483         HttpResponse response=HttpUtils.doPost(url, headers, params);
484         return HttpUtils.getStringFromResponse(response);
485     }519     //得到自己关注的成员
520     public String getSelfFollowIngs(){
521         return "";
522     }
523     //得到自己的粉丝
524     public String getSelfFollowers(){
525         return "";
526     }
527     //得到自己加入的微群
528     public String getSelfJoinedGroups(){
529         return "";
530     }
531     //得到自己的标签
532     public String getSelfTags(){
533         return "";
534     }
535     //得到自己发布的微博
536     public String getSelfReleaseTopics(){
537         return "";
538     }
539     //得到自己主页的微博
540     public String getSelfPageTopics(){
541         return "";
542     }
543     //关注一个人
544     public String addFollowing(String memberid){
545         return addorcancleFollowing(memberid,this.ADDFOLLOWING);
546     }
547     //取消关注一个人
548     public String cancelFollowing(String memberid){
549         return addorcancleFollowing(memberid,this.CANCELFOLLOWING);
550     }
551     private String addorcancleFollowing(String memberid,int option){
552         String url="";
553         switch(option){
554             case ADDFOLLOWING:
555                 url="http://weibo.com/aj/f/followed?__rnd="+System.currentTimeMillis();
556                 break;
557             case CANCELFOLLOWING:
558                 url="http://weibo.com/aj/f/unfollow?__rnd="+System.currentTimeMillis();
559                 break;
560         }
561
562         Map<String,String> params=new HashMap<String,String>();
563
564         this.headers.put("Referer", "http://weibo.com/");
565         this.headers.put("Host", "weibo.com");
566         this.headers.put("Content-Type", "application/x-www-form-urlencoded");
567         this.headers.put("Referer", "http://weibo.com/");
568         this.headers.put("x-requested-with", "XMLHttpRequest");
569
570         params.put("_t", "0");
571         params.put("f", "1");
572         params.put("location", "profile");
573         params.put("refer_flag", "");
574         params.put("refer_sort", "profile");
575         params.put("uid", memberid);
576
577         HttpResponse response=HttpUtils.doPost(url, headers, params);
578         return HttpUtils.getStringFromResponse(response);
579     }
584     /**
585      * 得到的标签信息  调用一次10个
586      * @return
587      */
588     public String getTags(){
589         String url="http://account.weibo.com/set/aj/tagsuggest?__rnd="+System.currentTimeMillis();
590         this.headers.put("Referer", "http://account.weibo.com/set/tag#");
591         this.headers.put("Host", "account.weibo.com");
592         HttpResponse response=HttpUtils.doGet(url, headers);
593         return HttpUtils.getStringFromResponse(response);
594     }
595
596     /**
597      * 得到微博热词信息
598      * @param k :热词的门类
599      */
600     public String getHotWords(String k){
601         String url="http://data.weibo.com/top/keyword?k="+k;
602         try{
603             Integer.parseInt(k);
604         }catch(Exception ex){
605             url="http://data.weibo.com/top/keyword?t="+k;
606         }
607         this.headers.put("Referer", "http://data.weibo.com/top/keyword");
608         this.headers.put("Host", "data.weibo.com");
609         HttpResponse response=HttpUtils.doGet(url, headers);
610         return HttpUtils.getStringFromResponse(response);
611     }
612
613     /**
614      * 得到微博热帖子
615      * @param cat  表示热帖门类
616      * @param page 表示页号
617      */
618     public String getHotWeibo(String cat,int page){
619         String url="http://data.weibo.com/hot/ajax/catfeed?page="+page+"&cat="+cat+"&_t=0&__rnd="+System.currentTimeMillis();
620         this.headers.put("Referer", "http://data.weibo.com/hot/minibloghot");
621         this.headers.put("Host", "data.weibo.com");
622         HttpResponse response=HttpUtils.doGet(url, headers);
623         return HttpUtils.getStringFromResponse(response);
624     }
625
626     /**
627      * 按照分类获取 微博吧名字  第一步
628      */
629     public String getWeiBar(String ctgid,int p){
630         String sort="post";
631         String url="http://weiba.weibo.com/aj_f/CategoryList?sort="+sort+"&p="+p+"&ctgid="+ctgid+"&_t=0&__rnd="+System.currentTimeMillis();
632         this.headers.put("Referer", "http://weiba.weibo.com/ct/"+ctgid);
633         this.headers.put("Host", "weiba.weibo.com");
634         this.headers.put("Accept", "*/*");
635         this.headers.put("Content-Type", "application/x-www-form-urlencoded");
636         this.headers.put("X-Requested-With", "XMLHttpRequest");
637         HttpResponse response=HttpUtils.doGet(url, headers);
638         return HttpUtils.getStringFromResponse(response);
639     }
640     /**
641      * 根据微博吧 名称 ,得到该吧内的所有帖子标题 第二步
642      */
643     public String getWeiBarByWeibarName(String bid,int p){
644         String url="http://weiba.weibo.com/aj_t/postlist?bid="+bid+"&p="+p+"&_t=all&__rnd="+System.currentTimeMillis();
645         this.headers.put("Referer", "http://weiba.weibo.com/");
646         this.headers.put("Host", "weiba.weibo.com");
647         this.headers.put("Accept", "*/*");
648         this.headers.put("Content-Type", "application/x-www-form-urlencoded");
649         this.headers.put("X-Requested-With", "XMLHttpRequest");
650         HttpResponse response=HttpUtils.doGet(url, headers);
651         return HttpUtils.getStringFromResponse(response);
652     }
653
654     /**
655      * 新浪微公益名单
656      * type ="donate"
657      * type="discuss"
658      */
659     public String getWeiGongYiMember(int page,int projectID,String type){
660         String url="http://gongyi.weibo.com/aj_personal_helpdata?page="+page+"&type="+type+"&project_id="+projectID+"&_t=0&__rnd="+System.currentTimeMillis();
661         this.headers.put("Referer", "http://gongyi.weibo.com/"+projectID);
662         this.headers.put("Host", "gongyi.weibo.com");
663         this.headers.put("Accept", "*/*");
664         this.headers.put("Content-Type", "application/x-www-form-urlencoded");
665         this.headers.put("X-Requested-With", "XMLHttpRequest");
666         HttpResponse response=HttpUtils.doGet(url, headers);
667         return HttpUtils.getStringFromResponse(response);
668     }
669 }
时间: 2024-09-28 16:29:12

用java实现新浪爬虫,代码完整剖析(仅针对当前SinaSignOn有效)的相关文章

Java 模拟新浪登录 2016

想学习一下网络爬虫.涉及到模拟登录,查阅了一番资料以后发现大部分都有点过时了,就使用前辈们给的经验,Firefox抓包调试,采用httpclient模拟了一下新浪登录. 不正确之处多多包涵.需要的可以用浏览器调试看看还有哪些需要改动的,改改就可以了. 新浪登录认证流程: 1.预登陆获取pubkey/nonce/rsak等用于加密用户信息(get). 返回json 2.login.php?client=ssologin对用户账号进行加密username采用base64加密,password采用rs

【python网络编程】新浪爬虫:关键词搜索爬取微博数据

上学期参加了一个大数据比赛,需要抓取大量数据,于是我从新浪微博下手,本来准备使用新浪的API的,无奈新浪并没有开放关键字搜索的API,所以只能用爬虫来获取了.幸运的是,新浪提供了一个高级搜索功能,为我们爬取数据提供了一个很好的切入点. 在查阅了一些资料,参考了一些爬虫的例子后,得到大体思路:构造URL,爬取网页,然后解析网页 具体往下看~ 登陆新浪微博,进入高级搜索,如图输入,之后发送请求会发现地址栏变为如下:    http://s.weibo.com/weibo/%25E4%25B8%25A

Java HttpClient(4.2) 爬虫代码

package spider; import java.io.BufferedReader; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.Collection; import java.util.D

python语言写的新浪爬虫

用python做了一个爬虫,程序没有错,但是运行结果如下,请问是什么问题?求高手解答 D:\eclipse\workspace\sina_spider\Sina_spider1\spiders\spiders.py:5: ScrapyDeprecationWarning: Module `scrapy.spider` is deprecated, use `scrapy.spiders` instead from scrapy.spider import CrawlSpider 2017-04-2

php 新浪通行证、新浪微博模拟统一登录 (后台网页抓取版) 2016

前几天做了一个Java的新浪通行证模拟登录测试.现在给大家一个php的新浪通行证.微博登录的示例:具体都有备注,大家阅读代码吧. <?php /** * tom 2016年4月12日10:37:08 模拟微博登录 */ class login_weibo { // 微博用户名称密码 private $username = ''; private $password = ''; //请求cookie private $request_cookie = ''; //预登陆返回json private

Java 8新特性-4 方法引用

对于引用来说我们一般都是用在对象,而对象引用的特点是:不同的引用对象可以操作同一块内容! Java 8的方法引用定义了四种格式: 引用静态方法     ClassName :: staticMethodName 引用对象方法:  Object:: methodName 引用特定类型方法: ClassName :: methodName 引用构造方法: ClassName  :: new 静态方法引用示例 /** * 静态方法引用 * @param <P> 引用方法的参数类型 * @param

Thinkcmf 在新浪云上的部署问题

最近要开发一个社团主页,于是想到了CMF内容管理系统,但是直接在自己的服务器测试成本太高,于是选择了在新浪云上进行部署测试. 但是在安装Thinkcmf的过程中产生了一些技术性的问题.但最后终于在自己的测试下解决了这个问题,在此过程中感谢胡明宣,胡哥的帮助.现将自己的安装过程总结如下: 1>你要有一个新浪的账号,注册新浪云账号,具体的见新浪云网站http://sae.sina.com.cn/,在新浪云里创建一个应用假设为demo 2>你要有一个Thinkcmf的安装包,具体版本选择见Thi

python爬虫:使用urllib.request和BeautifulSoup抓取新浪新闻标题、链接和主要内容

案例一 抓取对象: 新浪国内新闻(http://news.sina.com.cn/china/),该列表中的标题名称.时间.链接. 完整代码: from bs4 import BeautifulSoup import requests url = 'http://news.sina.com.cn/china/' web_data = requests.get(url) web_data.encoding = 'utf-8' soup = BeautifulSoup(web_data.text,'

腾讯、网易、新浪新闻网站爬虫编写记录及评论格式分析

0 前言 先说说看这篇博客你能知道什么:1 腾讯.网易.新浪不同新闻的地址格式以及评论内容的地址格式(返回数据为json的异步接口):2 一些比较通用的设计方法,对软件设计的菜鸟可能有帮助: 之前也说了要写这篇博客,现在终于写出来了.我的毕业设计的指导老师说毕设论文的字数不够--所以我决定把这些本不应该出现在论文中的实现细节凑到论文中.至于下面说到的东西要解决什么问题,各位可以先看看这个网站(我毕设的初步结果,目前还在优化中,包括代码结构还有UI设计):http://reetseenews.du