Sina Weibo Data Mining Cookbook, Part 10: Entities (Extracting the Attributions of a Reposted Weibo)

#!/usr/bin/python
# -*- coding: utf-8 -*-

'''
Created on 2015-1-6
@author: beyondzhou
@name: extract_repost_attributions.py
'''

# Extract repost attributions
def extract_repost_attributions():

    # import
    from search import weibo_search
    from entities import weibo_entities
    from login import weibo_login
    from statuses import fetch_repost_timeline, fetch_weibo_status, get_rt_attributions
    import json

    # Get access to the Sina weibo API
    weibo_api = weibo_login()

    # Do the search
    subject = weibo_search(topic='iphone')

    # Decode entities
    (mids, names, texts, dates, reposts, comments, likes) = weibo_entities(subject)

    # Find the first weibo whose repost count is greater than 0
    for index in range(len(reposts)):
        if reposts[index] > 0:
            weibo_id_reposted = mids[index]
            print 'reposts number: %s, weibo_id_reposted: %s' % (reposts[index], weibo_id_reposted)
            break

    # Fetch the repost timeline
    repost_timeline = fetch_repost_timeline(weibo_api, count=200, page=1, weibo_id=weibo_id_reposted)

    # Extract repost attributions (using the first record of repost_timeline as the example)
    repost_attributions = get_rt_attributions(repost_timeline[0])

    # Output repost weibo
    repost_weibo = fetch_weibo_status(weibo_api, weibo_id=weibo_id_reposted)
    print json.dumps(repost_weibo, indent=1)
    print 'Output repost weibo done!\n'

    # Output repost attribution
    for repost_att in repost_attributions:
        print repost_att
    print 'Output repost attribution done!\n'

if __name__ == '__main__':
    extract_repost_attributions()

# The three helpers below live in statuses.py (imported at the top of the script)

# Get the repost timeline of a weibo
def fetch_repost_timeline(weibo_api, count=200, page=1, weibo_id=1):

    repost_timeline = weibo_api.statuses.repost_timeline.get(count=count, page=page, id=weibo_id)
    statuses = repost_timeline['reposts']
    return statuses
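
The script above pulls only the first page of the repost timeline, while the sample weibo below carries close to 7,000 reposts. A minimal pagination sketch, assuming the API simply returns an empty 'reposts' list once the pages run out (fetch_all_reposts and max_pages are illustrative names, not part of the original statuses.py):

# Walk the repost timeline page by page (sketch; assumes an empty page marks the end)
def fetch_all_reposts(weibo_api, weibo_id, count=200, max_pages=10):

    all_reposts = []
    for page in range(1, max_pages + 1):
        batch = fetch_repost_timeline(weibo_api, count=count, page=page, weibo_id=weibo_id)
        if not batch:
            break
        all_reposts += batch
    return all_reposts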
# Get weibo status
def fetch_weibo_status(weibo_api, weibo_id=1):

    weibo_status = weibo_api.statuses.show.get(id=weibo_id)
    return weibo_status
# Get repost attributions
def get_rt_attributions(repost):
    import re

    # Regex adapted from Stack Overflow (http://bit.ly/1821y0J);
    # \u8f6c\u53d1 is the Chinese word for "repost"
    rt_patterns = re.compile(ur"(RT|via|\u8f6c\u53d1)((?:\b\W*@\w+)+)", re.IGNORECASE)
    rt_attributions = []

    # Inspect the repost to see if it carries a 'retweeted_status' field, which
    # mirrors Twitter's /statuses/retweet/:id mechanism.
    # See https://dev.twitter.com/docs/api/1.1/get/statuses/retweets/%3Aid.
    if 'retweeted_status' in repost:
        attribution = repost['retweeted_status']['user']['screen_name'].lower()
        rt_attributions.append(attribution)

    # Also inspect the repost text for "legacy" retweet patterns such as "RT",
    # "via", and the Chinese "转发", which are still widely used for various
    # reasons and potentially very useful. See https://dev.twitter.com/discussions/2847
    # and https://dev.twitter.com/discussions/1748 for some details on how/why.
    # The regex in action (trimmed from an interactive session):
    '''
    >>> import re
    >>> rt_patterns = re.compile(r"(RT|via)((?:\b\W*@\w+)+)", re.IGNORECASE)
    >>> subject = 'RT @SocialWebMining'
    >>> rt_patterns.findall(subject)
    [('RT', ' @SocialWebMining')]
    >>> rt_patterns.findall(subject)[0]
    ('RT', ' @SocialWebMining')
    >>> rt_patterns.findall(subject)[0][1]
    ' @SocialWebMining'

    Decoding an escaped attribution back to Chinese:
    >>> repost_attributions = '\u798f\u5229\u6d3e\u9001\u673a'
    >>> repost_attributions.decode("unicode_escape")
    u'\u798f\u5229\u6d3e\u9001\u673a'
    >>> print repost_attributions.decode("unicode_escape")
    福利派送机
    '''

    try:
        rt_attributions += [
            mention.strip()
                for mention in rt_patterns.findall(repost['text'])[0][1].split()
        ]
    except IndexError:
        pass

    # Filter out any duplicates
    return list(set([rta.strip("@").lower() for rta in rt_attributions]))
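
To sanity-check get_rt_attributions without calling the API, here is a quick offline sketch with hand-built sample dicts (the screen names and texts are made up for illustration): one repost exercises the 'retweeted_status' branch, the other the legacy "RT @user" text pattern.

if __name__ == '__main__':
    # Hypothetical repost produced through the native repost mechanism
    sample_native = {
        'text': u'nice stand!',
        'retweeted_status': {'user': {'screen_name': u'SocialWebMining'}},
    }
    # Hypothetical repost using the legacy "RT @user" convention in its text
    sample_legacy = {'text': u'RT @SocialWebMining check this out'}

    print get_rt_attributions(sample_native)   # ['socialwebmining']
    print get_rt_attributions(sample_legacy)   # ['socialwebmining']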

Result:

callback_url: https://api.weibo.com/oauth2/authorize?redirect_uri=http%3A//apps.weibo.com/guaguastd&response_type=code&client_id=2925245021
return_redirect_uri: http://weibo.com/login.php?url=http%3A%2F%2Fapps.weibo.com%2Fguaguastd%3Fcode%3D9d0a0ecb4df4db1d8d1a6ef5460c5e82
code: ['9d0a0ecb4df4db1d8d1a6ef5460c5e82']
now_handle: ce2b7c50-9531-11e4-b8c2-7bd88716b5dd
http://passport.weibo.com/
all_handles: [u'ce2b7c50-9531-11e4-b8c2-7bd88716b5dd', u'd3ba1000-9531-11e4-b8c2-7bd88716b5dd']
search done!
mids entities done!
names entities done!
texts entities done!
dates entities done!
reposts entities done!
comments entities done!
likes entities done!
reposts number: 6964, weibo_id_reposted: 3795801400243898
{
 "reposts_count": 6975,
 "truncated": false,
 "text": "1 toy 1 day\uff0c\u7b2c178\u671f\uff1a\u7f8e\u56fdBluelounge\uff0diPhone 5/5s\u6700\u4f73\u89c2\u770b\u89d2\u5ea6\u5145\u7535\u57fa\u5ea7\u3002\u624b\u673a\u653e\u5728\u684c\u4e0a\u5145\u7535\uff0c\u60f3\u770b\u4e00\u4e9b\u4e1c\u897f\uff0c\u611f\u89c9\u603b\u662f\u4e0d\u8212\u670d\u3002\u6709\u4e86\u5b83\uff0c\u4e0d\u4ec5\u5916\u89c2\u9ad8\u5927\u4e0a\uff0c\u8fd8\u8ba9\u4f60\u6709\u4e2a\u66f4\u597d\u7684\u89c2\u770b\u89d2\u5ea6\uff0c\u5145\u7535\u65f6\u7528\u8d77\u6765\u4e5f\u662f\u90a3\u4e48\u987a\u7545\u81ea\u5982\uff08\u8fd9\u662f\u6211\u9001\u51fa\u7684\u7b2c2232\u4ef6\u793c\u7269\uff0c\u5173\u8f6c\uff0c1\u67086\u65e5\u62bd\uff0c\u4e0d\u52301\u5929\uff0c\u5bf9\uff0c\u53ea\u5728\u7231\u8d34\uff0c\u56e0\u4e3a\u6709\u4f60\uff01\uff09",
 "visible": {
  "type": 0,
  "list_id": 0
 },
 "in_reply_to_status_id": "",
 "bmiddle_pic": "http://ww1.sinaimg.cn/bmiddle/005wRYdajw1enz2uspb4xj313y0pgmza.jpg",
 "id": 3795801400243898,
 "thumbnail_pic": "http://ww1.sinaimg.cn/thumbnail/005wRYdajw1enz2uspb4xj313y0pgmza.jpg",
 "mid": "3795801400243898",
 "source": "<a href=\"http://weibo.com/\" rel=\"nofollow\">\u5fae\u535a weibo.com</a>",
 "attitudes_count": 187,
 "in_reply_to_screen_name": "",
 "pic_urls": [
  {
   "thumbnail_pic": "http://ww1.sinaimg.cn/thumbnail/005wRYdajw1enz2uspb4xj313y0pgmza.jpg"
  },
  {
   "thumbnail_pic": "http://ww2.sinaimg.cn/thumbnail/005wRYdajw1enz2p7nulhj30go0egmxu.jpg"
  },
  {
   "thumbnail_pic": "http://ww3.sinaimg.cn/thumbnail/005wRYdajw1enz2pa8b0fj30i20i20uj.jpg"
  },
  {
   "thumbnail_pic": "http://ww3.sinaimg.cn/thumbnail/005wRYdajw1enz2pd1z82j312w12w40j.jpg"
  },
  {
   "thumbnail_pic": "http://ww3.sinaimg.cn/thumbnail/005wRYdajw1enz2us8x7vj31420mytaf.jpg"
  },
  {
   "thumbnail_pic": "http://ww3.sinaimg.cn/thumbnail/005wRYdajw1enz2utiqwqj30r00n2dgk.jpg"
  },
  {
   "thumbnail_pic": "http://ww3.sinaimg.cn/thumbnail/005wRYdajw1enz2uu0mcuj30us0ps76b.jpg"
  },
  {
   "thumbnail_pic": "http://ww2.sinaimg.cn/thumbnail/005wRYdajw1enz2vbtjkxj30lo0c7q53.jpg"
  },
  {
   "thumbnail_pic": "http://ww3.sinaimg.cn/thumbnail/005wRYdajw1enz2vmki74j30sg0ilab2.jpg"
  }
 ],
 "in_reply_to_user_id": "",
 "darwin_tags": [],
 "favorited": false,
 "original_pic": "http://ww1.sinaimg.cn/large/005wRYdajw1enz2uspb4xj313y0pgmza.jpg",
 "idstr": "3795801400243898",
 "source_type": 1,
 "user": {
  "cover_image": "http://ww4.sinaimg.cn/crop.0.0.920.300/005wRYdajw1emok192jcyj30pk08cgoi.jpg",
  "bi_followers_count": 3,
  "domain": "",
  "avatar_large": "http://tp1.sinaimg.cn/5066369752/180/5712388302/1",
  "verified_source": "",
  "ptype": 0,
  "cover_image_phone": "http://ww2.sinaimg.cn/crop.0.0.0.0/005wRYdajw1emovpmsh52j30hs0hrwhh.jpg",
  "statuses_count": 12132,
  "id": 5066369752,
  "verified_reason_url": "",
  "city": "1000",
  "verified": true,
  "friends_count": 4,
  "verified_reason_modified": "",
  "credit_score": 80,
  "block_app": 1,
  "follow_me": false,
  "verified_reason": "\u5317\u4eac\u7231\u8d34\u8fbe\u4eba\u7f51\u7edc\u6280\u672f\u6709\u9650\u516c\u53f8",
  "followers_count": 634775,
  "location": "\u5317\u4eac",
  "verified_state": 0,
  "verified_trade": "",
  "mbtype": 12,
  "verified_source_url": "",
  "profile_url": "u/5066369752",
  "block_word": 0,
  "avatar_hd": "http://ww1.sinaimg.cn/crop.0.0.943.943.1024/005wRYdajw1emu8osezk8j30q90q9jsp.jpg",
  "star": 0,
  "description": "\u6211\u9001\u7684\u4e0d\u662f\u793c\u7269\uff0c\u662f\u4efd\u5e0c\u671b\uff0c\u6bcf\u592910\u4e2a\uff0c\u5bf9\uff0c\u53ea\u5728\u7231\u8d34\uff01",
  "verified_contact_email": "[email protected]",
  "online_status": 0,
  "mbrank": 4,
  "verified_level": 3,
  "profile_image_url": "http://tp1.sinaimg.cn/5066369752/50/5712388302/1",
  "idstr": "5066369752",
  "verified_contact_mobile": "",
  "allow_all_act_msg": false,
  "allow_all_comment": true,
  "geo_enabled": true,
  "class": 1,
  "screen_name": "\u7231\u8d34",
  "lang": "zh-cn",
  "weihao": "",
  "remark": "",
  "favourites_count": 21,
  "name": "\u7231\u8d34",
  "url": "",
  "province": "11",
  "created_at": "Tue Mar 11 20:16:07 +0800 2014",
  "verified_contact_name": "Jason",
  "verified_type": 2,
  "gender": "m",
  "following": false,
  "pagefriends_count": 0,
  "urank": 15
 },
 "geo": null,
 "created_at": "Mon Jan 05 23:50:39 +0800 2015",
 "mlevel": 0,
 "comments_count": 745
}
Output repost weibo done!

爱贴
Output repost attribution done!