***PHP多线程pthreads 实现QQ号码爬虫

通过空间历史浏览,爬出查看你空间的人(一般限制20人,除非开通黄钻),然后再爬出这20人的浏览记录,依次向下爬,你可以控制爬行深度。
这里仅仅给出爬虫代码片段,你可以进一步优化,将QQ分类存储。通过QQ相互浏览关系,可以通过绘图工具绘制好友网络,等等。

代码涉及pthreads 如果不清楚请阅读:《PHP 高级编程之多线程》
http://netkiller.github.io/journal/thread.php.html

<?php
/*
Homepage: http://netkiller.github.io
Author: Neo <[email protected]>
*/
// Abort early when the pthreads extension is missing: Worker, Stackable and
// Pool used below are provided by that extension.
if (!extension_loaded('pthreads')) die('Please install pthreads');

// Snoopy is the HTTP client class used by Crawler::qzone().
include_once('Snoopy.class.php');

/**
 * Worker class handed to the Pool at the bottom of this script.
 *
 * It exposes a static database handle through getInstance(); the code that
 * would create that handle is currently commented out inside run().
 */
class CrawlerWorker extends Worker {

    // Database handle returned by getInstance(). Nothing assigns it while the
    // connection code in run() stays commented out, so it remains null.
    protected  static $dbh;
    // Empty constructor; performs no initialisation.
    public function __construct() {

    }
    // Currently does nothing: the PDO connection example below is disabled.
    // Uncomment and adjust it to store a connection in self::$dbh.
    public function run(){
    /*
        $dbhost = 'db.example.com';         // database server
        $dbuser = 'example.com';            // database user name
        $dbpw = 'password';                 // database password
        $dbname = 'example';                // database name

        self::$dbh  = new PDO("mysql:host=$dbhost;port=3306;dbname=$dbname", $dbuser, $dbpw, array(
            PDO::MYSQL_ATTR_INIT_COMMAND => 'SET NAMES \'UTF8\'',
            PDO::MYSQL_ATTR_COMPRESS => true,
            PDO::ATTR_PERSISTENT => true
            )
        );
    */
    }
    // Returns the static database handle (null unless run() assigns one).
    // NOTE(review): declared protected but called from Crawler::run() through
    // $this->worker; presumably this relies on how pthreads treats method
    // modifiers on threaded objects -- confirm against the pthreads version in use.
    protected function getInstance(){
        return self::$dbh;
    }

}

/**
 * Pool task that crawls Qzone visitor lists starting from one QQ number.
 *
 * run() fetches the visitors of the seed number, then the visitors of each of
 * those, and so on until the recursion level reaches $depth. Every fetched
 * list is printed with print_r(); nothing is persisted by this class itself.
 */
class Crawler extends Stackable {
    /** Level at which recursion() stops descending (see recursion()). */
    public $depth = 3;
    /** Seed QQ number this task starts from. */
    public $qq;

    /**
     * @param string|int $qq Seed QQ number to start crawling from.
     */
    public function __construct($qq) {
        $this->qq = $qq;
    }

    /**
     * Entry point executed by the worker thread: crawl from the seed number.
     *
     * A PDOException is logged to mobile_error.log as "<seed qq>,<message>".
     */
    public function run() {

        try {
            // Handle provided by the worker. It is not used yet (and is null
            // while CrawlerWorker::run() keeps its connection code disabled);
            // kept as the hook for storing results.
            $dbh  = $this->worker->getInstance();
            $this->recursion(array($this->qq));
        }
        catch(PDOException $e) {
            // The original formatted two undefined variables here; log the
            // seed number and the actual exception message instead.
            $error = sprintf("%s,%s\n", $this->qq, $e->getMessage());
            file_put_contents("mobile_error.log", $error, FILE_APPEND);
        }
    }

    /**
     * Recursively fetch and print the visitor lists of the given QQ numbers.
     *
     * The level is carried as an argument rather than in shared static state,
     * so sibling branches each see the correct level and the whole tree down
     * to $depth is visited instead of only the first chain.
     *
     * @param array $qqs   QQ numbers whose visitors should be fetched.
     * @param int   $level Level of the caller; 0 for the initial call.
     * @return void
     */
    public function recursion($qqs, $level = 0){

        $level++;
        printf("Level: %s\n", $level);
        if($level >= $this->depth){
            return;
        }

        foreach($qqs as $uin) {
            // Random 10ms-1s pause before every request to throttle the crawl.
            usleep(mt_rand(10000,1000000));
            $lst = $this->qzone($uin);
            print_r($lst);
            if(empty($lst)){
                // Nothing to descend into for this number.
                continue;
            }
            $this->recursion($lst, $level);
        }
    }

    /**
     * Fetch the visitor list of one QQ number from the mobile Qzone endpoint.
     *
     * @param string|int $qq QQ number whose visitors are requested.
     * @return array List of visitor uin values; empty when the request fails
     *               or the response does not contain data->list.
     */
    public function qzone($qq){
        // NOTE(review): g_tk, t and sid are hard-coded request parameters
        // copied from a captured request; presumably they are session-bound
        // and need replacing with values from a live session -- confirm.
        $url = 'http://m.qzone.com/mqz_get_visitor?g_tk=1191852101&res_mode=0&res_uin='.rawurlencode((string) $qq).'&offset=0&count=100&page=1&format=json&t=1401762986882&sid=dODKVcYv6azjN87cxXQ5mao1xgakYjHg18c8aa5e0201%3D%3D';
        $snoopy = new Snoopy;

        // need a proxy?
        //$snoopy->proxy_host = "my.proxy.host";
        //$snoopy->proxy_port = "8080";

        // set browser and referer:
        $snoopy->agent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)";
        $snoopy->referer = "http://m.qzone.com/";

        // set a raw header:
        $snoopy->rawheaders["Pragma"] = "no-cache";

        // set some internal variables:
        $snoopy->maxredirs = 2;
        $snoopy->offsiteok = false;
        $snoopy->expandlinks = false;

        // Request the visitor list; on failure report it and return an empty
        // list so callers can always iterate over the result.
        if(!$snoopy->fetchtext($url)){
            print "Snoopy: error while fetching document: ".$snoopy->error."\n";
            return array();
        }

        $results = array();
        $tmp = json_decode($snoopy->results);

        // Only walk data->list when the decoded response really has it;
        // error responses and non-JSON bodies yield an empty result.
        if(is_object($tmp) && isset($tmp->data->list)
            && (is_array($tmp->data->list) || is_object($tmp->data->list))){
            foreach( $tmp->data->list as $lst ){
                if(isset($lst->uin)){
                    $results[] = $lst->uin;
                }
            }
        }
        return $results;
    }
}

// Pool of size 100 whose workers are CrawlerWorker instances; the empty array
// is the constructor argument list passed to each worker.
$pool = new Pool(100, \CrawlerWorker::class, []);

// Alternative: submit a whole numeric range instead of explicit seed numbers.
#foreach (range(1000, 100000) as $number) {
#   $pool->submit(new Crawler($number));
#}

// One Crawler task per seed QQ number.
$pool->submit(new Crawler('13721218'));
$pool->submit(new Crawler('291379'));
//$pool->submit(new Crawler('xxx'));
//$pool->submit(new Crawler('xxx'));
//$pool->submit(new Crawler('xxx'));
// ... and so on
//$pool->submit(new Crawler('nnn'));

// Shut the pool's workers down once the tasks have been submitted.
$pool->shutdown();
?>
时间: 2024-10-09 00:25:38

***PHP多线程pthreads 实现QQ号码爬虫的相关文章

QQ空间爬虫最新分享,一天 400 万条数据(附代码地址)

http://mp.weixin.qq.com/s?__biz=MzAxMjUyNDQ5OA==&mid=2653552228&idx=1&sn=e476bf23556406cbce7de65508d79843&chksm=806dd0d9b71a59cf2b062a19309c849a62ba15790898e5e619205f0f5ec84a90025a8cea05e9&mpshare=1&scene=23&srcid=11252MeE6Qu1D

精准QQ号码采集器 在线QQ号码采集器 地区QQ号码采集 空间访客提取器

软件:精准QQ号码采集器 [特色功能]按同城,按同乡,按男女,按在线,按年龄段,按关键字来采集精准的QQ号码功能一:空间动态提取器,提取空间的动态(空间动态)功能二:动态访客提取器,提取空间动态的浏览,赞的QQ号码功能三:日志地址提取器,提取空间日志的地址,用于访客提取功能四:日志访客提取器,提取日志中的访客,赞过的人的所有QQ号码功能五:相册访客提取器,提取相册中的访客,支持漫游提取,监控提取功能六:留言号码提取器,提取留言板中所有留言过的QQ号码,功能七:QQ空间附近发说说的号码采集功能八:

PHP获取QQ群成员QQ号码

1.加入某个群 2.进入群空间http://qun.qzone.qq.com/group#!/25998059/member 备注:25998059为群号码 3.进入群成员列表 4.使用浏览,在某个群成员头像上面右击->审查元素,打开如下图所示的html代码 5.如上图拷贝 6.提取qq号码,以PHP为例 核心代码: $fileStr = file_get_contents(‘qqnumber.txt’); $qqNumberA = array(); preg_match_all(‘/\((.*

【转】一个在内存里搜索QQ号码的源码,源自看雪论坛

#include <windows.h> #include <tlhelp32.h> #include <tchar.h> #include <stdlib.h> #include <stdio.h> int SearchStr(PTSTR pszString, int iStrLen, PTSTR pszSearchStr) { int i = 0; int iSearchStrlen = _tcslen(pszSearchStr); whil

调用WebService查看QQ号码状态

1.1.  webService服务接口 QQ在线状态 WEB 服务 Endpoint:http://webservice.webxml.com.cn/webservices/qqOnlineWebService.asmx Disco:http://webservice.webxml.com.cn/webservices/qqOnlineWebService.asmx?disco WSDL:http://webservice.webxml.com.cn/webservices/qqOnlineW

js正则表达式:验证邮箱格式、密码复杂度、手机号码、QQ号码

直接上代码 Java 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83

第2个自己写的Java,从txt里提取QQ号码

流+正则表达式 做了一个抓取文件里QQ号码的几行代码,居然写了这么久....书到用时方恨少,累死了 ~~~~(>_<)~~~~

验证QQ号码正则表达式

验证QQ号码正则表达式:下面分享一个非常简单的验证QQ号码的正则表达式.代码如下: [1-9][0-9]{4,} 原文地址是http://www.softwhy.com/forum.php?mod=viewthread&tid=18676 更多内容可以参阅http://www.softwhy.com/zhengze/

找女神要QQ号码

这两天微信订阅号中反复出现<找女神要QQ号码>(e.g.http://www.cnblogs.com/iforever/p/4584490.html),题目是这样的: 给了一串数字(不是QQ号码),根据下面规则可以找出QQ号码:首先删除第一个数,紧接着将第二个数放到这串数字的末尾,再将第三个数删除,并将第四个数放到这串数字的末尾......如此循环,直到剩下最后一个数,将最后一个数也删除,按照刚才删除的顺序,把这些数字连在一起就是女神的QQ号码啦. 我也来凑个热闹: -------------