新浪微博模拟登录+数据抓取(Java实现)

模拟登录部分实现:

package token.exe;

import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import java.security.SecureRandom;
import java.util.Random;

import org.apache.commons.codec.binary.Base64;

/**
 * Encodes the credentials for the Sina Weibo SSO login request: the account name is
 * Base64-encoded (the {@code su} parameter) and the password is RSA-encrypted with
 * PKCS#1 v1.5 type-2 padding and rendered as hex (the {@code sp} parameter).
 *
 * <p>The class keeps no per-call state, so the static methods may be called from several
 * threads at once.
 */
public class WeiboEncoder {

    /** Fixed PKCS#1 v1.5 overhead: 0x00, 0x02, at least eight padding bytes, 0x00. */
    private static final int PKCS1_OVERHEAD = 11;

    /** Source of the padding bytes; the padding must not be predictable or repeatable. */
    private static final SecureRandom PADDING_RANDOM = new SecureRandom();

    /**
     * Base64-encodes the account name (the {@code su} login parameter).
     *
     * @param account the account name; must not be {@code null}
     * @return the unchunked Base64 text of the UTF-8 bytes of {@code account}
     */
    public static String encodeAccount(String account){
        // java.util.Base64 is spelled out in full because the simple name Base64 is already
        // bound to the commons-codec import in this file; the basic encoder produces the same
        // unchunked, padded output as commons-codec's encodeBase64(byte[]).
        return java.util.Base64.getEncoder()
                .encodeToString(account.getBytes(StandardCharsets.UTF_8));
    }

    /**
     * RSA-encrypts the password with PKCS#1 v1.5 padding (the {@code sp} login parameter).
     *
     * @param pwd  the plain-text password
     * @param nStr the RSA modulus as a hexadecimal string
     * @param eStr the RSA public exponent as a hexadecimal string
     * @return the ciphertext as lower-case hex, left-padded with one {@code 0} when needed
     *         so that the number of hex digits is even
     * @throws NumberFormatException    if {@code nStr} or {@code eStr} is not valid hex
     * @throws IllegalArgumentException if the UTF-8 encoded password does not fit into one
     *                                  RSA block of the given modulus
     */
    public static String RSAEncrypt(String pwd, String nStr, String eStr){
        // Key material is kept in locals (not static fields) so concurrent calls cannot
        // overwrite each other's modulus/exponent.
        BigInteger modulus = new BigInteger(nStr, 16);
        BigInteger exponent = new BigInteger(eStr, 16);

        int blockLength = (modulus.bitLength() + 7) >> 3;
        BigInteger cipher = pkcs1pad2(pwd, blockLength).modPow(exponent, modulus);

        String sp = cipher.toString(16);
        if ((sp.length() & 1) != 0) {
            sp = "0" + sp;
        }
        return sp;
    }

    /**
     * Builds the PKCS#1 v1.5 type-2 encryption block
     * {@code 0x00 || 0x02 || PS || 0x00 || M}, where {@code M} is the UTF-8 encoding of
     * {@code s} and {@code PS} is random non-zero padding.
     *
     * @param s the message to pad
     * @param n the block length in bytes (the byte length of the modulus)
     * @return the padded block as a non-negative integer
     * @throws IllegalArgumentException if the encoded message needs more than {@code n - 11} bytes
     */
    private static BigInteger pkcs1pad2(String s, int n){
        // Measure the encoded bytes, not the char count: non-ASCII characters take 2-4 bytes.
        byte[] message = s.getBytes(StandardCharsets.UTF_8);
        if (n < message.length + PKCS1_OVERHEAD) {
            throw new IllegalArgumentException("Message too long for RSA: "
                    + message.length + " bytes do not fit into a " + n + "-byte block");
        }

        byte[] block = new byte[n];
        int messageStart = n - message.length;
        System.arraycopy(message, 0, block, messageStart, message.length);

        block[0] = 0;
        block[1] = 2;
        // Every padding byte must be non-zero so the 0x00 separator stays unambiguous.
        for (int i = 2; i < messageStart - 1; i++) {
            block[i] = (byte) (1 + PADDING_RANDOM.nextInt(255));
        }
        block[messageStart - 1] = 0;

        return new BigInteger(1, block);
    }

}

 参数实体:

package token.def;

import java.io.Serializable;

/**
 * Serializable value holder for the parameters exchanged during the Weibo login flow:
 * the pre-login values ({@code pcid}, {@code servertime}, {@code nonce}, {@code rsakv}),
 * the captcha image URL, the encrypted password ({@code sp}), the resulting authorization
 * {@code code}, and a login flag that starts out as {@code true}.
 */
public class LoginParams implements Serializable {

    private static final long serialVersionUID = -5775728968372860382L;

    private String pcid;
    private String servertime;
    private String nonce;
    private String rsakv;
    private String imgUrl;
    private String sp;
    private String code;
    private boolean isLogin = true;

    // ---- accessors -----------------------------------------------------------------

    /** @return the captcha id, or {@code null} if not set */
    public String getPcid() { return this.pcid; }

    /** @return the server time value, or {@code null} if not set */
    public String getServertime() { return this.servertime; }

    /** @return the nonce, or {@code null} if not set */
    public String getNonce() { return this.nonce; }

    /** @return the RSA key version, or {@code null} if not set */
    public String getRsakv() { return this.rsakv; }

    /** @return the captcha image URL, or {@code null} if not set */
    public String getImgUrl() { return this.imgUrl; }

    /** @return the encrypted password, or {@code null} if not set */
    public String getSp() { return this.sp; }

    /** @return the authorization code, or {@code null} if not set */
    public String getCode() { return this.code; }

    /** @return the login flag; {@code true} unless changed through {@link #setLogin(boolean)} */
    public boolean isLogin() { return this.isLogin; }

    // ---- mutators ------------------------------------------------------------------

    /** @param pcid the captcha id */
    public void setPcid(String pcid) { this.pcid = pcid; }

    /** @param servertime the server time value */
    public void setServertime(String servertime) { this.servertime = servertime; }

    /** @param nonce the nonce */
    public void setNonce(String nonce) { this.nonce = nonce; }

    /** @param rsakv the RSA key version */
    public void setRsakv(String rsakv) { this.rsakv = rsakv; }

    /** @param imgUrl the captcha image URL */
    public void setImgUrl(String imgUrl) { this.imgUrl = imgUrl; }

    /** @param sp the encrypted password */
    public void setSp(String sp) { this.sp = sp; }

    /** @param code the authorization code */
    public void setCode(String code) { this.code = code; }

    /** @param isLogin the new value of the login flag */
    public void setLogin(boolean isLogin) { this.isLogin = isLogin; }

    /** Renders every field as {@code name=value} inside {@code LoginParams [...]}. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("LoginParams [");
        text.append("pcid=").append(pcid);
        text.append(", servertime=").append(servertime);
        text.append(", nonce=").append(nonce);
        text.append(", rsakv=").append(rsakv);
        text.append(", imgUrl=").append(imgUrl);
        text.append(", sp=").append(sp);
        text.append(", code=").append(code);
        text.append(", isLogin=").append(isLogin);
        return text.append("]").toString();
    }

}
登录部分实现:

package token.exe;

import java.io.FileOutputStream;
import java.io.IOException;
import java.net.URLEncoder;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Properties;
import java.util.Scanner;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.HttpVersion;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.commons.httpclient.params.HttpClientParams;
import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
import org.apache.commons.httpclient.protocol.Protocol;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import token.SinaWeiboOAuth;
import token.def.LoginParams;
import weibo4j.model.MySSLSocketFactory;

public class WeiboLoginer {

    private HttpClient httpClient; //httpClient实例初始化

    /**
     * Creates the loginer and configures its {@link HttpClient}: a multi-threaded connection
     * manager with 3000 ms connect/socket timeouts and up to 100 connections per host,
     * browser-compatible cookie handling, HTTP/1.1, a list of default request headers, and a
     * custom socket factory registered for the "https" scheme.
     */
    public  WeiboLoginer() {

        // Default headers attached to the client's host configuration.
        List<Header> defaultHeaders = new ArrayList<Header>();
        defaultHeaders.add(new Header("Content-Type", "application/x-www-form-urlencoded"));
        defaultHeaders.add(new Header("Host", "login.sina.com.cn"));
        defaultHeaders.add(new Header("User-Agent","Mozilla/5.0 (Windows NT 6.1; rv:25.0) Gecko/20100101 Firefox/25.0"));
        // The next three headers present a made-up address as the caller's IP.
        defaultHeaders.add(new Header("API-RemoteIP", "192.168.0.1"));
        defaultHeaders.add(new Header("X-Forwarded-For","192.168.0.1"));
        defaultHeaders.add(new Header("CLIENT-IP", "192.168.0.1"));

        // Client-level parameters.
        HttpClientParams clientParams = new HttpClientParams();
        clientParams.setVersion(HttpVersion.HTTP_1_1);
        clientParams.setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);

        // Connection-manager parameters.
        MultiThreadedHttpConnectionManager connectionManager = new MultiThreadedHttpConnectionManager();
        HttpConnectionManagerParams managerParams = connectionManager.getParams();
        managerParams.setSoTimeout(3000);
        managerParams.setConnectionTimeout(3000);
        managerParams.setDefaultMaxConnectionsPerHost(100);

        // Assemble the client and install the default headers.
        httpClient = new HttpClient(clientParams, connectionManager);
        httpClient.getHostConfiguration().getParams().setParameter("http.default-headers", defaultHeaders);

        // Register the project's socket factory for https on port 443.
        // NOTE(review): Protocol.registerProtocol is a static registry, so this presumably
        // affects every HttpClient in the JVM - confirm that is intended.
        Protocol.registerProtocol("https", new Protocol("https",new MySSLSocketFactory(), 443));

        // Optional proxy configuration (disabled):
//        httpClient.getHostConfiguration().setProxy("", 0);
//        httpClient.getParams().setAuthenticationPreemptive(false);
    }

    /**
     * Logs in without a captcha and tries to obtain the OAuth authorization code.
     *
     * <p>Outcomes, all reported through the returned {@link LoginParams}:
     * <ul>
     *   <li>code set - login and authorization succeeded;</li>
     *   <li>imgUrl, pcid, nonce, servertime, rsakv and sp set, code {@code null} - the server
     *       asks for a captcha (either at pre-login or after the login request); pass these
     *       values to {@link #doLoginByPin};</li>
     *   <li>neither code nor imgUrl set - the pre-login or login request failed; the
     *       exception has been printed to stderr.</li>
     * </ul>
     *
     * @param username the account name
     * @param password the plain-text password
     * @return the login result, never {@code null}
     */
    public LoginParams doLogin(String username, String password) {

        Properties properties = initProperties();
        String base64UserCount = WeiboEncoder.encodeAccount(username);
        LoginParams loginParams = new LoginParams();

        // Step 1: pre-login to fetch the RSA public key, nonce and server time.
        HashMap<String, String> pubkeyMap;
        try {
            pubkeyMap = pubKeyMap(base64UserCount);
        } catch (IOException e) {
            e.printStackTrace();
            // Without the pre-login data nothing below can work; stop here instead of
            // running into a NullPointerException further down.
            return loginParams;
        }

        String sp = WeiboEncoder.RSAEncrypt(password, pubkeyMap.get("pubkey"),"10001");

        // Step 2: the pre-login response may already demand a captcha.
        String imgUrl = getPin(pubkeyMap);
        if (imgUrl != null) {
            fillCaptchaChallenge(loginParams, pubkeyMap, imgUrl, sp);
            return loginParams;
        }

        // Step 3: the actual login request, which yields the ticket.
        HashMap<String, String> ticketMap;
        try {
            ticketMap = getTicket(base64UserCount, sp, pubkeyMap);
        } catch (Exception e1) {
            e1.printStackTrace();
            return loginParams;
        }

        // Step 4: the login response itself may ask for a captcha (the original comment
        // notes this happens when the account is a Sina registration e-mail address).
        String vcUrl = isHasPinAgain(pubkeyMap, ticketMap);
        if (vcUrl != null) {
            // Hand back the freshly built captcha URL; imgUrl is always null at this point.
            fillCaptchaChallenge(loginParams, pubkeyMap, vcUrl, sp);
            return loginParams;
        }

        // Step 5: exchange the ticket for the authorization code.
        try {
            String code = authorize(ticketMap.get("ticket"), properties.getProperty("authorizeURL"),
                    properties.getProperty("redirect_URI"), properties.getProperty("client_ID"),
                    username, ticketMap.get("uid"));

            loginParams.setCode(code);
        } catch (KeyManagementException e) {
            e.printStackTrace();
        } catch (NoSuchAlgorithmException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return loginParams;

    }

    /** Copies the values a caller needs to answer a captcha challenge into {@code target}. */
    private void fillCaptchaChallenge(LoginParams target, HashMap<String, String> pubkeyMap,
            String captchaUrl, String sp) {
        target.setPcid(pubkeyMap.get("pcid"));
        target.setNonce(pubkeyMap.get("nonce"));
        target.setServertime(pubkeyMap.get("servertime"));
        target.setRsakv(pubkeyMap.get("rsakv"));
        target.setImgUrl(captchaUrl);
        target.setSp(sp);
    }

    /**
     * Logs in when a captcha is required and tries to obtain the authorization code.
     *
     * <p>If the server reports that the captcha was wrong, the result has
     * {@code isLogin() == false} and no code. Any exception is printed to stderr and the
     * result is returned as filled in so far.
     *
     * @param username   the account name
     * @param sp         the encrypted password
     * @param pin        the captcha text typed by the user
     * @param pcid       the captcha id
     * @param servertime the server time from pre-login
     * @param nonce      the nonce from pre-login
     * @param rsakv      the RSA key version from pre-login
     * @return the login result, never {@code null}
     */
    public LoginParams doLoginByPin(String username, String sp, String pin, String pcid,
            String servertime,String nonce,String rsakv ) {

        Properties config = initProperties();
        String encodedAccount = WeiboEncoder.encodeAccount(username);
        LoginParams outcome = new LoginParams();

        try {
            HashMap<String, String> ticketInfo = getTicket(encodedAccount, sp, pin, pcid,
                    servertime, nonce, rsakv);

            if (ticketInfo.containsKey("reason")) {
                // The server's still-escaped text for "the verification code entered is incorrect".
                String wrongPinReason = "\\u8f93\\u5165\\u7684\\u9a8c\\u8bc1\\u7801\\u4e0d\\u6b63\\u786e";
                String actualReason = ticketInfo.get("reason");
                if (actualReason.equals(wrongPinReason)) {
                    outcome.setLogin(false);
                    return outcome;
                }
            }

            outcome.setCode(authorize(ticketInfo.get("ticket"), config.getProperty("authorizeURL"),
                    config.getProperty("redirect_URI"), config.getProperty("client_ID"),
                    username, ticketInfo.get("uid")));
        } catch (Exception e) {
            e.printStackTrace();
        }

        return outcome;
    }

    /**
     * Simulates the Sina authorization step by posting the login ticket to the authorize URL.
     *
     * <p>On HTTP 200 the returned page is handed to {@link #authorizeAgain} for a second
     * submission; on HTTP 302 the code is taken from the Location header (everything after
     * the first {@code '='}). Any other status yields {@code null}.
     *
     * @param ticket       the ticket obtained from the login request
     * @param authorizeURL the authorization endpoint
     * @param redirectURI  the application's callback address
     * @param clientId     the application's appKey
     * @param username     the account name
     * @param uid          the user id obtained from the login request
     * @return the authorization code, or {@code null} if none was obtained
     * @throws IOException              if the HTTP exchange fails
     * @throws KeyManagementException   declared for callers; not raised by the visible code
     * @throws NoSuchAlgorithmException declared for callers; not raised by the visible code
     */
    private String authorize(String ticket, String authorizeURL, String redirectURI,
            String clientId, String username, String uid) throws IOException,
            KeyManagementException, NoSuchAlgorithmException {

        String code = null;
        String url = authorizeURL + "?client_id=" + clientId + "&redirect_uri="
                + redirectURI + "&response_type=code&forcelogin=true";
        String regCallback = authorizeURL + "?client_id=" + clientId + "&redirect_uri="
                + redirectURI + "&response_type=code&display=default&from=&with_cookie=";
        PostMethod post = new PostMethod(authorizeURL);
        try {
            // Mimic the page that requested the token; per the original author the callback
            // comes back empty without this Referer.
            post.setRequestHeader("Referer",url);
            // Form fields submitted by the simulated login.
            NameValuePair[] formpPairs=new NameValuePair[]{
                    new NameValuePair("action", "login"),
                    new NameValuePair("userId",username),
                    new NameValuePair("ticket", ticket),
                    new NameValuePair("response_type", "code"),
                    new NameValuePair("redirect_uri", redirectURI),
                    new NameValuePair("client_id", clientId),
                    new NameValuePair("regCallback", URLEncoder.encode(regCallback, "UTF-8"))
                    };
            post.setRequestBody(formpPairs);
            int status = httpClient.executeMethod(post);
            if (status == HttpStatus.SC_OK) {
                byte[] htmlDatas = post.getResponseBody();
                code = authorizeAgain(htmlDatas, ticket, authorizeURL,
                        redirectURI, clientId, username, uid);
            }else if (status == HttpStatus.SC_MOVED_TEMPORARILY) {
                Header locationHeader = post.getResponseHeader("location");
                String location = locationHeader.getValue();
                code = location.substring(location.indexOf("=")+1);
            }
        } finally {
            // Always hand the connection back to the pool, even when the request throws;
            // otherwise the per-host connection limit is eventually exhausted.
            post.releaseConnection();
        }

        return code;
    }

    /**
     * Submits the authorization request a second time, using the {@code verifyToken} hidden
     * field found in the page returned by the first request.
     *
     * @param htmlDatas    the page returned by the first authorization request (UTF-8 bytes)
     * @param ticket       the login ticket (not used by this step)
     * @param authorizeURL the authorization endpoint
     * @param redirectURI  the application's callback address
     * @param clientId     the application's appKey
     * @param username     the account name (not used by this step)
     * @param uid          the user id obtained from the login request
     * @return the authorization code taken from the redirect Location (everything after the
     *         first {@code '='}), or {@code null} if the response is not a 302
     * @throws IOException          if the page has no {@code verifyToken} input or the HTTP
     *                              exchange fails
     * @throws HttpException        if the HTTP exchange violates the protocol
     * @throws NullPointerException if the 302 response carries no Location value
     */
    private String authorizeAgain(byte[] htmlDatas, String ticket, String authorizeURL,
            String redirectURI,String clientId, String username,
            String uid) throws HttpException, IOException {

        String html = new String(htmlDatas, "utf-8");
        Document doc = Jsoup.parse(html);
        Element verifyTokeneElement = doc.select("input[name=verifyToken]").first();
        if (verifyTokeneElement == null) {
            // first() yields null when nothing matches, e.g. when the server returned an
            // error page instead of the confirmation form.
            throw new IOException("authorization page contains no verifyToken input");
        }
        String verifyToken = verifyTokeneElement.attr("value");

        String code = null;
        String url = authorizeURL + "?client_id=" + clientId + "&redirect_uri="
                + redirectURI + "&response_type=code&forcelogin=true";
        String regCallback = authorizeURL + "?client_id=" + clientId + "&redirect_uri="
                + redirectURI + "&response_type=code&display=default&from=&with_cookie=";
        PostMethod post = new PostMethod(authorizeURL);
        try {
            // Per the original author the callback comes back empty without a Referer.
            post.setRequestHeader("Referer",authorizeURL);
            // Form fields submitted by the simulated confirmation.
            NameValuePair[] formpPairs=new NameValuePair[]{
                    new NameValuePair("action", "authorize"),
                    new NameValuePair("uid",uid),
                    new NameValuePair("url", url),
                    new NameValuePair("response_type", "code"),
                    new NameValuePair("redirect_uri", redirectURI),
                    new NameValuePair("client_id", clientId),
                    new NameValuePair("verifyToken", verifyToken),
                    new NameValuePair("regCallback", URLEncoder.encode(regCallback, "UTF-8"))
                    };
            post.setRequestBody(formpPairs);
            int status = httpClient.executeMethod(post);
            if (status == HttpStatus.SC_MOVED_TEMPORARILY) {
                Header locationHeader = post.getResponseHeader("location");
                // Check the header itself first; a missing header would otherwise fail
                // before the intended error below is reached.
                String location = (locationHeader == null) ? null : locationHeader.getValue();
                if (location == null) {
                    throw new NullPointerException("redirect_uri is null");
                }
                code = location.substring(location.indexOf("=")+1);
            }
        } finally {
            // Return the connection to the pool whatever happened above.
            post.releaseConnection();
        }
        return code;
    }

    /**
     * Performs the pre-login request and returns the key/value pairs of its response.
     *
     * @param unameBase64 the Base64-encoded account name
     * @return the parsed response fields (this class later reads pubkey, pcid, nonce,
     *         servertime, rsakv and showpin from it)
     * @throws IOException if the HTTP request fails
     */
    private HashMap<String, String> pubKeyMap(String unameBase64)
            throws IOException {

        // Base64 text may contain '+', '/' and '='; a raw '+' in a query string is decoded
        // as a space by the receiver, so the value has to be percent-encoded.
        String su = URLEncoder.encode(unameBase64, "UTF-8");

        String url = "https://login.sina.com.cn/sso/prelogin.php?"
                + "entry=openapi&"
                + "callback=sinaSSOController.preloginCallBack&" + "su="
                + su + "&" + "rsakt=mod&" + "checkpin=1&"
                + "client=ssologin.js(v1.4.5)" + "&_=" + new Date().getTime();
        return getParaFromResult(get(url));
    }

    /**
     * Decides from the pre-login response whether a captcha is required.
     *
     * @param pubkeyMap the parsed pre-login response; may be {@code null}
     * @return the captcha image URL when the response's {@code showpin} value parses to 1,
     *         otherwise {@code null}
     * @throws NumberFormatException if {@code showpin} is present but not an integer
     */
    private String getPin(HashMap<String, String> pubkeyMap) {

        if (pubkeyMap == null) {
            return null;
        }
        String showpinFlag = pubkeyMap.get("showpin");
        if (showpinFlag == null || Integer.parseInt(showpinFlag) != 1) {
            return null;
        }
        // "r" is a random value; Math.floor returns a double, so it is rendered in double
        // notation (for example 1.2345678E7).
        return "https://login.sina.com.cn/cgi/pin.php?"
                + "r=" + Math.floor(Math.random() * 100000000)
                + "&s=0"
                + "&p=" + pubkeyMap.get("pcid");
    }

    /**
     * Checks whether the login response asks for a captcha after all.
     *
     * @param pubkeyMap the parsed pre-login response; may be {@code null}
     * @param ticketMap the parsed login response; may be {@code null}
     * @return the captcha image URL when the login response's {@code reason} equals the
     *         server's "please enter the verification code" message, otherwise {@code null}
     */
    private String isHasPinAgain(HashMap<String, String> pubkeyMap,
            HashMap<String, String> ticketMap) {

        if (pubkeyMap == null || ticketMap == null) {
            return null;
        }
        if (!ticketMap.containsKey("reason")) {
            return null;
        }

        // The server's still-escaped text for
        // "for the safety of your account, please enter the verification code".
        String captchaRequiredReason = "\\u4e3a\\u4e86\\u60a8\\u7684\\u5e10\\u53f7\\u5b89" +
                "\\u5168\\uff0c\\u8bf7\\u8f93\\u5165\\u9a8c\\u8bc1\\u7801";

        String actualReason = ticketMap.get("reason");
        if (!actualReason.equals(captchaRequiredReason)) {
            return null;
        }

        // "r" is a random value, rendered in double notation because Math.floor returns a double.
        return "https://login.sina.com.cn/cgi/pin.php?"
                + "r=" + Math.floor(Math.random() * 100000000)
                + "&s=0"
                + "&p=" + pubkeyMap.get("pcid");
    }

    /**
     * Builds a fresh captcha image URL for the given captcha id.
     *
     * @param pcid the captcha id
     * @return the captcha image URL, or {@code null} when {@code pcid} is {@code null}
     */
    public String getVCode(String pcid) {

        if (pcid == null) {
            return null;
        }
        StringBuilder captchaUrl = new StringBuilder("https://login.sina.com.cn/cgi/pin.php?");
        // Random "r" value; appended as a double, exactly as Math.floor returns it.
        captchaUrl.append("r=").append(Math.floor(Math.random() * 100000000));
        captchaUrl.append("&s=0");
        captchaUrl.append("&p=").append(pcid);
        return captchaUrl.toString();
    }

    /**
     * Downloads the captcha image and writes it to {@code vc.jpg} in the working directory.
     * Nothing is written unless the server answers with HTTP 200; I/O failures are printed
     * to stderr.
     *
     * @param url the captcha image URL
     */
    public void saveVCodeImg(String url) {

        GetMethod getImages = new GetMethod(url);
        try {
            int status = httpClient.executeMethod(getImages);
            if (status == HttpStatus.SC_OK) {
                // Read the body before opening the file so a failed download does not
                // leave a truncated vc.jpg behind.
                byte[] image = getImages.getResponseBody();
                FileOutputStream outputStream = new FileOutputStream("vc.jpg");
                try {
                    outputStream.write(image);
                } finally {
                    // Close the file even when the write fails.
                    outputStream.close();
                }
            }
        } catch (HttpException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Return the connection to the pool in every case.
            getImages.releaseConnection();
        }

    }

    /**
     * Sends the login request without a captcha and returns the parsed response, which
     * carries the ticket on success.
     *
     * @param usernameBase64 the Base64-encoded account name
     * @param sp             the encrypted password (the request declares {@code pwencode=rsa})
     * @param pubkeyMap      the parsed pre-login response supplying servertime, nonce and rsakv
     * @return the key/value pairs of the login response
     * @throws IllegalArgumentException if {@code pubkeyMap} is {@code null}
     * @throws Exception                if the HTTP request or the response parsing fails
     */
    private HashMap<String, String> getTicket(String usernameBase64,
            String sp, HashMap<String, String> pubkeyMap) throws Exception {

        if (pubkeyMap == null) {
            // Previously this fell through with a null URL and failed somewhere inside
            // the HTTP layer; fail here with a message that names the cause.
            throw new IllegalArgumentException("pubkeyMap is null: pre-login data is required");
        }

        // Base64 text may contain '+', which a query string would turn into a space.
        String su = URLEncoder.encode(usernameBase64, "UTF-8");

        String url = "https://login.sina.com.cn/sso/login.php?"
                + "entry=openapi&"
                + "gateway=1&"
                + "from=&"
                + "savestate=0&"
                + "useticket=1&"
                + "pagerefer=&"
                + "ct=1800&"
                + "s=1&"
                + "vsnf=1&"
                + "vsnval=&"
                + "door=&"
                + "su=" + su + "&"
                + "service=miniblog&"
                + "servertime=" + pubkeyMap.get("servertime") + "&"
                + "nonce=" + pubkeyMap.get("nonce") + "&"
                + "pwencode=rsa&"
                + "rsakv=" + pubkeyMap.get("rsakv") + "&"
                + "sp=" + sp + "&"
                + "encoding=UTF-8&"
                + "callback=sinaSSOController.loginCallBack&"
                + "cdult=2&"
                + "domain=weibo.com&"
                + "prelt=37&"
                + "returntype=TEXT&"
                + "client=ssologin.js(v1.4.5)&" + "_=" + new Date().getTime();

        return getParaFromResult(get(url));
    }

    /**
     * Sends the login request together with a captcha answer and returns the parsed
     * response, which carries the ticket on success.
     *
     * @param usernameBase64 the Base64-encoded account name
     * @param sp             the encrypted password (the request declares {@code pwencode=rsa})
     * @param pin            the captcha text typed by the user
     * @param pcid           the captcha id
     * @param servertime     the server time from pre-login
     * @param nonce          the nonce from pre-login
     * @param rsakv          the RSA key version from pre-login
     * @return the key/value pairs of the login response
     * @throws Exception if the HTTP request or the response parsing fails
     */
    public HashMap<String, String> getTicket(String usernameBase64, String sp, String pin,
            String pcid, String servertime,String nonce,String rsakv) throws Exception {

        // Percent-encode the values that can contain reserved characters: the captcha is
        // free text typed by the user and Base64 may contain '+', which a query string
        // would turn into a space. String.valueOf keeps a null rendered as "null", as the
        // plain concatenation did before.
        String su = URLEncoder.encode(String.valueOf(usernameBase64), "UTF-8");
        String door = URLEncoder.encode(String.valueOf(pin), "UTF-8");
        String captchaId = URLEncoder.encode(String.valueOf(pcid), "UTF-8");

        String url = "https://login.sina.com.cn/sso/login.php?"
                        + "entry=openapi&"
                        + "gateway=1&"
                        + "from=&"
                        + "savestate=0&"
                        + "useticket=1&"
                        + "pagerefer=&"
                        + "pcid=" + captchaId + "&"
                        + "ct=1800&"
                        + "s=1&"
                        + "vsnf=1&"
                        + "vsnval=&"
                        + "door=" + door + "&"
                        + "su=" + su + "&"
                        + "service=miniblog&"
                        + "servertime=" + servertime + "&"
                        + "nonce=" + nonce + "&"
                        + "pwencode=rsa&"
                        + "rsakv=" + rsakv + "&"
                        + "sp=" + sp + "&"
                        + "encoding=UTF-8&"
                        + "callback=sinaSSOController.loginCallBack&"
                        + "cdult=2&"
                        + "domain=weibo.com&"
                        + "prelt=37&"
                        + "returntype=TEXT&"
                        + "client=ssologin.js(v1.4.5)&" + "_=" + new Date().getTime();

        return getParaFromResult(get(url));
    }

    /**
     * Extracts the flat {@code key:value} pairs from a response of the form
     * {@code callback({"k1":"v1","k2":2})}.
     *
     * <p>This is a deliberately simple scanner, not a JSON parser: the text between the
     * first <code>{</code> and the next <code>}</code> is split on commas, each piece is
     * split at its first colon, and one pair of surrounding double quotes is removed from
     * key and value. Escape sequences inside values are left untouched. Pieces without a
     * colon (for example the tail of a value that itself contained a comma) are skipped.
     *
     * @param result the response body
     * @return the extracted pairs; empty when the braces enclose nothing usable
     * @throws IllegalArgumentException if {@code result} is {@code null} or contains no
     *                                  <code>{...}</code> section
     */
    private HashMap<String, String> getParaFromResult(String result) {

        if (result == null) {
            throw new IllegalArgumentException("no response body to parse");
        }
        int open = result.indexOf("{");
        int close = (open < 0) ? -1 : result.indexOf("}", open + 1);
        if (close < 0) {
            throw new IllegalArgumentException("response contains no {...} section: " + result);
        }

        HashMap<String, String> hm = new HashMap<String, String>();
        String[] pairs = result.substring(open + 1, close).split(",");
        for (int i = 0; i < pairs.length; i++) {
            // Split at the FIRST colon only, so values such as URLs keep their own colons.
            int colon = pairs[i].indexOf(':');
            if (colon < 0) {
                continue;
            }
            hm.put(stripQuotes(pairs[i].substring(0, colon)),
                    stripQuotes(pairs[i].substring(colon + 1)));
        }
        return hm;
    }

    /** Trims {@code text} and removes one pair of enclosing double quotes, if present. */
    private String stripQuotes(String text) {
        String trimmed = text.trim();
        if (trimmed.length() >= 2 && trimmed.startsWith("\"") && trimmed.endsWith("\"")) {
            return trimmed.substring(1, trimmed.length() - 1);
        }
        return trimmed;
    }

    /**
     * Issues a GET request and returns the response body decoded as UTF-8.
     *
     * @param url the URL to fetch
     * @return the response body, or {@code null} when the status is not HTTP 200
     * @throws IOException if the request fails
     */
    private String get(String url) throws IOException {

        GetMethod getMethod = new GetMethod(url);
        try {
            int status = httpClient.executeMethod(getMethod);
            if (status != HttpStatus.SC_OK) {
                return null;
            }
            return new String(getMethod.getResponseBody(), "UTF-8");
        } finally {
            // Release in finally so a failed request does not leak its pooled connection.
            getMethod.releaseConnection();
        }
    }

    /**
     * Loads {@code config.properties} from the context class loader. This class reads the
     * keys authorizeURL, redirect_URI and client_ID from the result.
     *
     * <p>Loading is best-effort: when the resource is missing or unreadable the problem is
     * reported on stderr and an empty {@link Properties} is returned.
     *
     * @return the loaded properties, never {@code null}
     */
    private Properties initProperties() {

        Properties prop = new Properties();
        java.io.InputStream in = Thread.currentThread().getContextClassLoader().
                getResourceAsStream("config.properties");
        if (in == null) {
            // getResourceAsStream returns null for a missing resource; loading from null
            // would throw a NullPointerException.
            System.err.println("config.properties not found on the classpath");
            return prop;
        }
        try {
            prop.load(in);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                in.close();
            } catch (IOException ignored) {
                // Nothing useful can be done if closing a read-only stream fails.
            }
        }
        return prop;
    }

    /**
     * Console demo of the login flow: log in, and if no code came back, show the captcha
     * URL, read the captcha from stdin (one retry if it was wrong) and finally print the
     * token obtained for the authorization code. The account name and password are empty
     * placeholders to be filled in before running.
     *
     * @param args unused
     */
    public static void main(String[] args) {

        WeiboLoginer loginer = new WeiboLoginer();
        LoginParams loginParams = loginer.doLogin("","");

        if (loginParams.getCode() != null) {
            // No captcha was needed.
            String code = loginParams.getCode();
            System.out.println(SinaWeiboOAuth.getToken(code));
            return;
        }

        // A captcha is required.
        String pcid = loginParams.getPcid();
        String nonce = loginParams.getNonce();
        String rsakv = loginParams.getRsakv();
        String servertime = loginParams.getServertime();
        String sp = loginParams.getSp();

        System.err.println(loginParams.getImgUrl());
        // Fetch another captcha URL.
        System.err.println(loginer.getVCode(pcid));

        // One Scanner for all console input: a second Scanner on System.in can miss
        // input the first one has already buffered.
        Scanner input = new Scanner(System.in);
        String pin = input.nextLine();

        LoginParams loginResult = loginer.doLoginByPin("",sp, pin, pcid, servertime, nonce, rsakv);
        if (!loginResult.isLogin()) {
            System.err.println("验证码错误!重新录入");

            // Fetch a new captcha and save it to disk (test aid).
            String imgUrl = loginer.getVCode(pcid);
            loginer.saveVCodeImg(imgUrl);

            String pin1 = input.nextLine();
            loginResult = loginer.doLoginByPin("",sp, pin1, pcid, servertime, nonce, rsakv);
        }

        // Print the token for both the first-try and the retry case; previously a captcha
        // accepted on the first try produced no output at all.
        System.out.println(SinaWeiboOAuth.getToken(loginResult.getCode()));
    }
}

参考地址 http://www.cnblogs.com/zhengbing/p/3459249.html

时间: 2024-10-07 18:17:09

新浪微博模拟登录+数据抓取(Java实现)的相关文章

腾讯微博模拟登陆+数据抓取(java实现)

不多说,贴出相关代码. 参数实体: package token.def; import java.io.Serializable; import java.util.Properties; public class TLoginParams implements Serializable { private static final long serialVersionUID = 6120319409538285515L; private String saltUin; private Stri

模拟登录+数据爬取 (python+selenium)

以下代码是用来爬取LinkedIn网站一些学者的经历的,仅供参考,注意:不要一次性大量爬取会被封号,不要问我为什么知道 #-*- coding:utf-8 -*- from selenium import webdriver from selenium.webdriver.common.keys import Keys import time from bs4 import BeautifulSoup diver=webdriver.Chrome() diver.get('https://www

[python]利用selenium模拟用户操作抓取天猫评论数据

准备: python3.5 安装selenium包 第一种方法: cmd里输pip install selenium,但是经常报错 第二种方法: 下载安装包-cmd进入解压路径-python setup.py install-报错permission denied-右键安全更改报错文件夹权限为完全控制-再次安装成功unknown error: unable to discover open pages-下载chromedriver放在环境变量目录下测试自动打开百度时提示"您使用的是不受支持的命令

大数据抓取采集框架(摘抄至http://blog.jobbole.com/46673/)

摘抄至http://blog.jobbole.com/46673/ 随着BIG DATA大数据概念逐渐升温,如何搭建一个能够采集海量数据的架构体系摆在大家眼前.如何能够做到所见即所得的无阻拦式采集.如何快速把不规则页面结构化并存储.如何满足越来越多的数据采集还要在有限时间内采集.这篇文章结合我们自身项目经验谈一下. 我们来看一下作为人是怎么获取网页数据的呢? 1.打开浏览器,输入网址url访问页面内容.2.复制页面内容的标题.作者.内容.3.存储到文本文件或者excel. 从技术角度来说整个过程

浅谈数据抓取的几种方法

在下抓数据也小有研究,现分享几个自己研究出来的抓数据的技术,可能会有很多不足的地方,欢迎大家指正补充哈哈! 方法一:直接抓取网页源码优点:速度快.缺点:1,正由于速度快,易被服务器端检测,可能会限制当前ip的抓取.对于这点,可以尝试使用ip代码解决.   2,如果你要抓取的数据,是在网页加载完后,js修改了网页元素,无法抓取.   3,遇到抓取一些大型网站,如果需要抓取如登录后的页面,可能需要破解服务器端帐号加密算法以及各种加密算法,及其考验技术性.适用场景:网页完全静态化,并且你要抓取的数据在

Ajax异步数据抓取

1.简介 1 有时候我们在用requests抓取页面的时候,得到的结果可能和在浏览器中看到的不一样,在浏览 2 器中可以看到正常显示的页面数据,但是使用requests得到的结果并没有.这是因为requests获取的 3 都是原始的HTML文档,而浏览器中的页面则是经过JavaScript处理数据后生成的结果,这些数据的 4 来源有多种,可能是通过ajax加载的,可能是包含在HTML文档中的,也可能是经过JavaScript和特 5 定算法计算后生成的. 6 对于第一种情况,数据加载是一种异步加

爬虫---selenium动态网页数据抓取

动态网页数据抓取 什么是AJAX: AJAX(Asynchronous JavaScript And XML)异步JavaScript和XML.通过在后台与服务器进行少量数据交换,Ajax 可以使网页实现异步更新.这意味着可以在不重新加载整个网页的情况下,对网页的某部分进行更新.传统的网页(不使用Ajax)如果需要更新内容,必须重载整个网页页面.因为传统的在传输数据格式方面,使用的是XML语法.因此叫做AJAX,其实现在数据交互基本上都是使用JSON.使用AJAX加载的数据,即使使用了JS,将数

第四章爬虫进阶之动态网页数据抓取

动态网页数据抓取 什么是AJAX: AJAX(Asynchronous JavaScript And XML)异步JavaScript和XML.通过在后台与服务器进行少量数据交换,Ajax 可以使网页实现异步更新.这意味着可以在不重新加载整个网页的情况下,对网页的某部分进行更新.传统的网页(不使用Ajax)如果需要更新内容,必须重载整个网页页面.因为传统的在传输数据格式方面,使用的是XML语法.因此叫做AJAX,其实现在数据交互基本上都是使用JSON.使用AJAX加载的数据,即使使用了JS,将数

数据抓取的艺术(三):抓取Google数据之心得

本来是想把这部分内容放到前一篇<数据抓取的艺术(二):数据抓取程序优化>之中.但是随着任务的完成,我越来越感觉到其中深深的趣味,现总结如下: (1)时间     时间是一个与抓取规模相形而生的因素,数据规模越大,时间消耗往往越长.所以程序优化变得相当重要,要知道抓取时间越长,出错的可能性就越大,这还不说程序需要人工干预的情境.一旦运行中需要人工干预,时间越长,干预次数越多,出错的几率就更大了.在数据太多,工期太短的情况下,使用多线程抓取,也是一个好办法,但这会增加程序复杂度,对最终数据准确性产