python D:\pymine\clean\spider_map\get_bd_uid_rest_b.py
python D:\pymine\clean\spider_map\get_bd_uid_rest.py
python D:\pymine\clean\spider_map\get_bd_uid_28_other20_b.py
#MAX_USED_TIMES = 1900
python D:\pymine\clean\spider_map\get_bd_uid_28_other20.py
python D:\pymine\clean\spider_map\get_bd_uid.py
python D:\pymine\clean\spider_map\get_bd_uid.py
python D:\pymine\clean\spider_map\get_bd_uid.py
# Baidu Map "place suggestion" fetcher (variant 1).
#
# Reads request names from an Excel sheet, skips every city listed in
# target_city_list_pass (the big cities plus the cities read from
# 省会城市.txt), and starts one thread per remaining city.  Each thread
# requests the suggestion API for every pending name and stores the raw
# response text as <city><district><name>.txt in the result directory.
# API keys and their daily usage counters live in a local SQLite database.
import os
import sqlite3
import sys
import threading
import time
from contextlib import closing

import requests
import xlrd

curPath = os.path.abspath(os.path.dirname(__file__))
rootPath = os.path.split(curPath)[0]
sys.path.append(rootPath)

MAX_USED_TIMES, overrun_str, DB_KEY_EXHAUST = 1900, '天配额超限,限制访问', 'DB_KEY_EXHAUST'
# Seconds to wait for the API before the request is recorded as failed;
# without a timeout a stalled connection would block its thread forever.
REQUEST_TIMEOUT = 30

db = 'py_bdspider_status.db'
db = os.path.join(curPath, db)

# Every third line of the city file is taken as a city name.
# NOTE(review): presumably the file holds three-line records whose last
# line is the city -- confirm against 省会城市.txt.
pcity_list = []
pcity_file = os.path.join(curPath, '省会城市.txt')
with open(pcity_file, 'r', encoding='utf-8') as pf:
    for line_no, line in enumerate(pf, start=1):
        if line_no % 3 == 0:
            pcity_list.append(line.replace(' ', '').replace('\n', '') + '市')
pcity_sorted_list = sorted(pcity_list)

target_city_list_big = ['广州市', '厦门市', '深圳市', '北京市', '杭州市', '成都市', '上海市', '西安市']
# Cities this variant must NOT process.  Built from a copy so that
# target_city_list_big itself is not extended as a side effect.
target_city_list_pass = list(target_city_list_big)
for pcity in pcity_list:
    if pcity not in target_city_list_pass:
        target_city_list_pass.append(pcity)

# NOTE: the key table is baidu_map_key_used(author, key, update_time,
# today_used).  It was originally seeded from the tab-separated file
# bdmap_key.txt (author<TAB>key per line) with today_used = 0.


def db_get_one_effective():
    """Return the least-used API key that is still within today's limit.

    Returns:
        The key string, or DB_KEY_EXHAUST when no key has
        today_used <= MAX_USED_TIMES.
    """
    sql = 'SELECT key FROM baidu_map_key_used WHERE today_used<=? ORDER BY today_used ASC'
    # closing() guarantees the connection is released on every path.
    with closing(sqlite3.connect(db)) as conn:
        res = conn.execute(sql, (MAX_USED_TIMES,)).fetchone()
    return DB_KEY_EXHAUST if res is None else res[0]


def db_update_one_today_used(key):
    """Increment today's usage counter of `key` and stamp the update time."""
    localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    sql = 'UPDATE baidu_map_key_used SET today_used = today_used+1 ,update_time=? WHERE key=?'
    with closing(sqlite3.connect(db)) as conn:
        conn.execute(sql, (localtime_, key))
        conn.commit()


dir_, dir_exception = 'baidu_map_uid', 'baidu_map_uid_exception'
requested_file_list = []
# Both directory strings keep a trailing separator because file names are
# appended to them by plain string formatting.
requested_file_dir_str = os.path.join(curPath, dir_, '')
requested_file_dir_exception_str = os.path.join(curPath, dir_exception, '')
requested_file_dir = os.listdir(requested_file_dir_str)


def _saved_name(request_name):
    """Return `request_name` as it appears in a saved file name ('?' removed)."""
    return request_name.replace('?', '')


def chk_if_requested_file():
    """Add the names of all result files currently on disk to requested_file_list.

    The directory is listed on every call so that files written after
    start-up are picked up as well.
    """
    known = set(requested_file_list)
    for f in os.listdir(requested_file_dir_str):
        to_in = f.split('.txt')[0]
        if to_in not in known:
            known.add(to_in)
            requested_file_list.append(to_in)


chk_if_requested_file()


def _write_text(directory, request_name, str_, type_):
    """Write `str_` as UTF-8 to <directory><request_name><type_>."""
    # '?' is dropped from the name (e.g. '上海市虹口区岳阳医院?'),
    # presumably because it is not allowed in Windows file names.
    fname = '%s%s%s' % (directory, _saved_name(request_name), type_)
    with open(fname, 'w', encoding='utf-8') as ft:
        ft.write(str_)


def write_requested_res(request_name, str_, type_='.txt'):
    """Store a successful API response and report it on stdout."""
    _write_text(requested_file_dir_str, request_name, str_, type_)
    print('ok', threading.get_ident(), request_name)


def write_requested_exception_res(request_name, str_, type_='.txt'):
    """Store a failure marker for `request_name` in the exception directory."""
    _write_text(requested_file_dir_exception_str, request_name, str_, type_)


request_dic = {}
target_city_list = []


def gen_request_dic_list():
    """Fill request_dic[city][district] with the names that still need a request.

    Rows whose city is in target_city_list_pass are skipped; every other
    city seen is recorded in target_city_list.  Names whose result file
    already exists are left out.
    """
    fname_source = '官方上传任务.csv_py170829093808-BD_request_name-REDUCTION170829142821'
    FEXCEL = os.path.join(curPath, fname_source + '.xlsx')
    # NOTE: xlrd >= 2.0 reads .xls only; this call needs xlrd < 2.0.
    data = xlrd.open_workbook(FEXCEL)
    table = data.sheets()[0]
    # Row 0 is skipped (treated as the header row).
    for i in range(1, table.nrows):
        dbid, area_code, name_, request_name, type_, city, district, addr, street = table.row_values(i)
        if city in target_city_list_pass:
            continue
        if city not in target_city_list:
            target_city_list.append(city)
        request_name_chk = '%s%s%s' % (city, district, request_name)
        if _saved_name(request_name_chk) in requested_file_list:
            continue
        names = request_dic.setdefault(city, {}).setdefault(district, [])
        if request_name not in names:
            names.append(request_name)


gen_request_dic_list()

fname_source = '官方上传任务.csv_py170829093808-BD_request_name-REDUCTION170829142821'
# Example request:
# http://api.map.baidu.com/place/v2/suggestion?query=瀛嘉天下&region=重庆市&city_limit=true&output=json&ak=<your-ak>
base_url = 'http://api.map.baidu.com/place/v2/suggestion?query=R-QUERY&region=R-CITY&city_limit=true&output=json&ak=R-AK'


def fun_(city):
    """Request and store the suggestion result for every pending name of `city`.

    Stops as soon as no API key with remaining quota is available.
    """
    for district in request_dic[city]:
        for request_name in request_dic[city][district]:
            request_name_chk = '%s%s%s' % (city, district, request_name)
            chk_if_requested_file()
            if _saved_name(request_name_chk) in requested_file_list:
                continue
            ak = db_get_one_effective()
            if ak == DB_KEY_EXHAUST:
                print(DB_KEY_EXHAUST)
                # No key left: give up on the whole city, not just this district.
                return
            url_ = base_url.replace('R-QUERY', request_name).replace('R-CITY', city).replace('R-AK', ak)
            try:
                bd_res_json_str = requests.get(url_, timeout=REQUEST_TIMEOUT).text
                db_update_one_today_used(ak)
                write_requested_res(request_name_chk, bd_res_json_str)
            except Exception:
                # Best effort: record the failure and carry on with the next name.
                bd_res_json_str = '请求百度-异常'
                write_requested_exception_res(request_name_chk, bd_res_json_str)
                print(bd_res_json_str)


class MyThread(threading.Thread):
    """Thread that calls `func(args)` with a single positional argument."""

    def __init__(self, func, args):
        threading.Thread.__init__(self)
        self.func, self.args = func, args

    def run(self):
        self.func(self.args)


thread_sum = len(target_city_list)


def main():
    """Run one worker thread per city with pending names and wait for all."""
    threads_list = []
    for city in target_city_list:
        # A city whose names were all fetched already has no request_dic entry.
        if city not in request_dic:
            continue
        threads_list.append(MyThread(fun_, city))
    for t in threads_list:
        t.daemon = False
        t.start()
    for t in threads_list:
        t.join()


if __name__ == '__main__':
    main()
# Baidu Map "place suggestion" fetcher (variant 2).
#
# Processes the cities read from 省会城市.txt that are not in
# target_city_list_big, minus the first eleven of them.  One thread per
# city requests the suggestion API for every pending name and stores the
# raw response text as <city><district><name>.txt in the result directory.
# API keys and their daily usage counters live in a local SQLite database.
import os
import sqlite3
import sys
import threading
import time
from contextlib import closing

import requests
import xlrd

curPath = os.path.abspath(os.path.dirname(__file__))
rootPath = os.path.split(curPath)[0]
sys.path.append(rootPath)

MAX_USED_TIMES, overrun_str, DB_KEY_EXHAUST = 1900, '天配额超限,限制访问', 'DB_KEY_EXHAUST'
# Seconds to wait for the API before the request is recorded as failed;
# without a timeout a stalled connection would block its thread forever.
REQUEST_TIMEOUT = 30

db = 'py_bdspider_status.db'
db = os.path.join(curPath, db)

# Every third line of the city file is taken as a city name.
# NOTE(review): presumably the file holds three-line records whose last
# line is the city -- confirm against 省会城市.txt.
pcity_list = []
pcity_file = os.path.join(curPath, '省会城市.txt')
with open(pcity_file, 'r', encoding='utf-8') as pf:
    for line_no, line in enumerate(pf, start=1):
        if line_no % 3 == 0:
            pcity_list.append(line.replace(' ', '').replace('\n', '') + '市')
pcity_sorted_list = sorted(pcity_list)

target_city_list_big = ['广州市', '厦门市', '深圳市', '北京市', '杭州市', '成都市', '上海市', '西安市']
target_city_list = [pcity for pcity in pcity_list if pcity not in target_city_list_big]

# NOTE: the key table is baidu_map_key_used(author, key, update_time,
# today_used).  It was originally seeded from the tab-separated file
# bdmap_key.txt (author<TAB>key per line) with today_used = 0.

# This variant handles everything after the first eleven cities.
target_city_list = target_city_list[11:]


def db_get_one_effective():
    """Return an API key that is still within today's limit.

    Returns:
        The key string, or DB_KEY_EXHAUST when no key has
        today_used <= MAX_USED_TIMES.
    """
    sql = 'SELECT key FROM baidu_map_key_used WHERE today_used<=?'
    # closing() guarantees the connection is released on every path.
    with closing(sqlite3.connect(db)) as conn:
        res = conn.execute(sql, (MAX_USED_TIMES,)).fetchone()
    return DB_KEY_EXHAUST if res is None else res[0]


def db_update_one_today_used(key):
    """Increment today's usage counter of `key` and stamp the update time."""
    localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    sql = 'UPDATE baidu_map_key_used SET today_used = today_used+1 ,update_time=? WHERE key=?'
    with closing(sqlite3.connect(db)) as conn:
        conn.execute(sql, (localtime_, key))
        conn.commit()


dir_, dir_exception = 'baidu_map_uid', 'baidu_map_uid_exception'
requested_file_list = []
# Both directory strings keep a trailing separator because file names are
# appended to them by plain string formatting.
requested_file_dir_str = os.path.join(curPath, dir_, '')
requested_file_dir_exception_str = os.path.join(curPath, dir_exception, '')
requested_file_dir = os.listdir(requested_file_dir_str)


def _saved_name(request_name):
    """Return `request_name` as it appears in a saved file name ('?' removed)."""
    return request_name.replace('?', '')


def chk_if_requested_file():
    """Add the names of all result files currently on disk to requested_file_list.

    The directory is listed on every call so that files written after
    start-up are picked up as well.
    """
    known = set(requested_file_list)
    for f in os.listdir(requested_file_dir_str):
        to_in = f.split('.txt')[0]
        if to_in not in known:
            known.add(to_in)
            requested_file_list.append(to_in)


chk_if_requested_file()


def _write_text(directory, request_name, str_, type_):
    """Write `str_` as UTF-8 to <directory><request_name><type_>."""
    # '?' is dropped from the name (e.g. '上海市虹口区岳阳医院?'),
    # presumably because it is not allowed in Windows file names.
    fname = '%s%s%s' % (directory, _saved_name(request_name), type_)
    with open(fname, 'w', encoding='utf-8') as ft:
        ft.write(str_)


def write_requested_res(request_name, str_, type_='.txt'):
    """Store a successful API response and report it on stdout."""
    _write_text(requested_file_dir_str, request_name, str_, type_)
    print('ok', threading.get_ident(), request_name)


def write_requested_exception_res(request_name, str_, type_='.txt'):
    """Store a failure marker for `request_name` in the exception directory."""
    _write_text(requested_file_dir_exception_str, request_name, str_, type_)


request_dic = {}


def gen_request_dic_list():
    """Fill request_dic[city][district] with the names that still need a request.

    Only rows whose city is in target_city_list are considered; names
    whose result file already exists are left out.
    """
    fname_source = '官方上传任务.csv_py170829093808-BD_request_name-REDUCTION170829142821'
    FEXCEL = os.path.join(curPath, fname_source + '.xlsx')
    # NOTE: xlrd >= 2.0 reads .xls only; this call needs xlrd < 2.0.
    data = xlrd.open_workbook(FEXCEL)
    table = data.sheets()[0]
    # Row 0 is skipped (treated as the header row).
    for i in range(1, table.nrows):
        dbid, area_code, name_, request_name, type_, city, district, addr, street = table.row_values(i)
        if city not in target_city_list:
            continue
        request_name_chk = '%s%s%s' % (city, district, request_name)
        if _saved_name(request_name_chk) in requested_file_list:
            continue
        names = request_dic.setdefault(city, {}).setdefault(district, [])
        if request_name not in names:
            names.append(request_name)


gen_request_dic_list()

fname_source = '官方上传任务.csv_py170829093808-BD_request_name-REDUCTION170829142821'
# Example request:
# http://api.map.baidu.com/place/v2/suggestion?query=瀛嘉天下&region=重庆市&city_limit=true&output=json&ak=<your-ak>
base_url = 'http://api.map.baidu.com/place/v2/suggestion?query=R-QUERY&region=R-CITY&city_limit=true&output=json&ak=R-AK'


def fun_(city):
    """Request and store the suggestion result for every pending name of `city`.

    Stops as soon as no API key with remaining quota is available.
    """
    for district in request_dic[city]:
        for request_name in request_dic[city][district]:
            request_name_chk = '%s%s%s' % (city, district, request_name)
            chk_if_requested_file()
            if _saved_name(request_name_chk) in requested_file_list:
                continue
            ak = db_get_one_effective()
            if ak == DB_KEY_EXHAUST:
                print(DB_KEY_EXHAUST)
                # No key left: give up on the whole city, not just this district.
                return
            url_ = base_url.replace('R-QUERY', request_name).replace('R-CITY', city).replace('R-AK', ak)
            try:
                bd_res_json_str = requests.get(url_, timeout=REQUEST_TIMEOUT).text
                db_update_one_today_used(ak)
                write_requested_res(request_name_chk, bd_res_json_str)
            except Exception:
                # Best effort: record the failure and carry on with the next name.
                bd_res_json_str = '请求百度-异常'
                write_requested_exception_res(request_name_chk, bd_res_json_str)
                print(bd_res_json_str)


class MyThread(threading.Thread):
    """Thread that calls `func(args)` with a single positional argument."""

    def __init__(self, func, args):
        threading.Thread.__init__(self)
        self.func, self.args = func, args

    def run(self):
        self.func(self.args)


thread_sum = len(target_city_list)


def main():
    """Run one worker thread per city with pending names and wait for all."""
    threads_list = []
    for city in target_city_list:
        # A city whose names were all fetched already has no request_dic entry.
        if city not in request_dic:
            continue
        threads_list.append(MyThread(fun_, city))
    for t in threads_list:
        t.daemon = False
        t.start()
    for t in threads_list:
        t.join()


if __name__ == '__main__':
    main()
# Baidu Map "place suggestion" fetcher (variant 3).
#
# Processes the cities read from 省会城市.txt that are not in
# target_city_list_big, minus the first eleven of them, always picking
# the least-used API key.  One thread per city requests the suggestion
# API for every pending name and stores the raw response text as
# <city><district><name>.txt in the result directory.  API keys and their
# daily usage counters live in a local SQLite database.
import os
import sqlite3
import sys
import threading
import time
from contextlib import closing

import requests
import xlrd

curPath = os.path.abspath(os.path.dirname(__file__))
rootPath = os.path.split(curPath)[0]
sys.path.append(rootPath)

MAX_USED_TIMES, overrun_str, DB_KEY_EXHAUST = 1900, '天配额超限,限制访问', 'DB_KEY_EXHAUST'
# Seconds to wait for the API before the request is recorded as failed;
# without a timeout a stalled connection would block its thread forever.
REQUEST_TIMEOUT = 30

db = 'py_bdspider_status.db'
db = os.path.join(curPath, db)

# Every third line of the city file is taken as a city name.
# NOTE(review): presumably the file holds three-line records whose last
# line is the city -- confirm against 省会城市.txt.
pcity_list = []
pcity_file = os.path.join(curPath, '省会城市.txt')
with open(pcity_file, 'r', encoding='utf-8') as pf:
    for line_no, line in enumerate(pf, start=1):
        if line_no % 3 == 0:
            pcity_list.append(line.replace(' ', '').replace('\n', '') + '市')
pcity_sorted_list = sorted(pcity_list)

target_city_list_big = ['广州市', '厦门市', '深圳市', '北京市', '杭州市', '成都市', '上海市', '西安市']
target_city_list = [pcity for pcity in pcity_list if pcity not in target_city_list_big]

# NOTE: the key table is baidu_map_key_used(author, key, update_time,
# today_used).  It was originally seeded from the tab-separated file
# bdmap_key.txt (author<TAB>key per line) with today_used = 0.

# This variant handles everything after the first eleven cities.
target_city_list = target_city_list[11:]


def db_get_one_effective():
    """Return the least-used API key that is still within today's limit.

    Returns:
        The key string, or DB_KEY_EXHAUST when no key has
        today_used <= MAX_USED_TIMES.
    """
    sql = 'SELECT key FROM baidu_map_key_used WHERE today_used<=? ORDER BY today_used ASC'
    # closing() guarantees the connection is released on every path.
    with closing(sqlite3.connect(db)) as conn:
        res = conn.execute(sql, (MAX_USED_TIMES,)).fetchone()
    return DB_KEY_EXHAUST if res is None else res[0]


def db_update_one_today_used(key):
    """Increment today's usage counter of `key` and stamp the update time."""
    localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    sql = 'UPDATE baidu_map_key_used SET today_used = today_used+1 ,update_time=? WHERE key=?'
    with closing(sqlite3.connect(db)) as conn:
        conn.execute(sql, (localtime_, key))
        conn.commit()


dir_, dir_exception = 'baidu_map_uid', 'baidu_map_uid_exception'
requested_file_list = []
# Both directory strings keep a trailing separator because file names are
# appended to them by plain string formatting.
requested_file_dir_str = os.path.join(curPath, dir_, '')
requested_file_dir_exception_str = os.path.join(curPath, dir_exception, '')
requested_file_dir = os.listdir(requested_file_dir_str)


def _saved_name(request_name):
    """Return `request_name` as it appears in a saved file name ('?' removed)."""
    return request_name.replace('?', '')


def chk_if_requested_file():
    """Add the names of all result files currently on disk to requested_file_list.

    The directory is listed on every call so that files written after
    start-up are picked up as well.
    """
    known = set(requested_file_list)
    for f in os.listdir(requested_file_dir_str):
        to_in = f.split('.txt')[0]
        if to_in not in known:
            known.add(to_in)
            requested_file_list.append(to_in)


chk_if_requested_file()


def _write_text(directory, request_name, str_, type_):
    """Write `str_` as UTF-8 to <directory><request_name><type_>."""
    # '?' is dropped from the name (e.g. '上海市虹口区岳阳医院?'),
    # presumably because it is not allowed in Windows file names.
    fname = '%s%s%s' % (directory, _saved_name(request_name), type_)
    with open(fname, 'w', encoding='utf-8') as ft:
        ft.write(str_)


def write_requested_res(request_name, str_, type_='.txt'):
    """Store a successful API response and report it on stdout."""
    _write_text(requested_file_dir_str, request_name, str_, type_)
    print('ok', threading.get_ident(), request_name)


def write_requested_exception_res(request_name, str_, type_='.txt'):
    """Store a failure marker for `request_name` in the exception directory."""
    _write_text(requested_file_dir_exception_str, request_name, str_, type_)


request_dic = {}


def gen_request_dic_list():
    """Fill request_dic[city][district] with the names that still need a request.

    Only rows whose city is in target_city_list are considered; names
    whose result file already exists are left out.
    """
    fname_source = '官方上传任务.csv_py170829093808-BD_request_name-REDUCTION170829142821'
    FEXCEL = os.path.join(curPath, fname_source + '.xlsx')
    # NOTE: xlrd >= 2.0 reads .xls only; this call needs xlrd < 2.0.
    data = xlrd.open_workbook(FEXCEL)
    table = data.sheets()[0]
    # Row 0 is skipped (treated as the header row).
    for i in range(1, table.nrows):
        dbid, area_code, name_, request_name, type_, city, district, addr, street = table.row_values(i)
        if city not in target_city_list:
            continue
        request_name_chk = '%s%s%s' % (city, district, request_name)
        if _saved_name(request_name_chk) in requested_file_list:
            continue
        names = request_dic.setdefault(city, {}).setdefault(district, [])
        if request_name not in names:
            names.append(request_name)


gen_request_dic_list()

fname_source = '官方上传任务.csv_py170829093808-BD_request_name-REDUCTION170829142821'
# Example request:
# http://api.map.baidu.com/place/v2/suggestion?query=瀛嘉天下&region=重庆市&city_limit=true&output=json&ak=<your-ak>
base_url = 'http://api.map.baidu.com/place/v2/suggestion?query=R-QUERY&region=R-CITY&city_limit=true&output=json&ak=R-AK'


def fun_(city):
    """Request and store the suggestion result for every pending name of `city`.

    Stops as soon as no API key with remaining quota is available.
    """
    for district in request_dic[city]:
        for request_name in request_dic[city][district]:
            request_name_chk = '%s%s%s' % (city, district, request_name)
            chk_if_requested_file()
            if _saved_name(request_name_chk) in requested_file_list:
                continue
            ak = db_get_one_effective()
            if ak == DB_KEY_EXHAUST:
                print(DB_KEY_EXHAUST)
                # No key left: give up on the whole city, not just this district.
                return
            url_ = base_url.replace('R-QUERY', request_name).replace('R-CITY', city).replace('R-AK', ak)
            try:
                bd_res_json_str = requests.get(url_, timeout=REQUEST_TIMEOUT).text
                db_update_one_today_used(ak)
                write_requested_res(request_name_chk, bd_res_json_str)
            except Exception:
                # Best effort: record the failure and carry on with the next name.
                bd_res_json_str = '请求百度-异常'
                write_requested_exception_res(request_name_chk, bd_res_json_str)
                print(bd_res_json_str)


class MyThread(threading.Thread):
    """Thread that calls `func(args)` with a single positional argument."""

    def __init__(self, func, args):
        threading.Thread.__init__(self)
        self.func, self.args = func, args

    def run(self):
        self.func(self.args)


thread_sum = len(target_city_list)


def main():
    """Run one worker thread per city with pending names and wait for all."""
    threads_list = []
    for city in target_city_list:
        # A city whose names were all fetched already has no request_dic
        # entry; starting a worker for it would raise KeyError in fun_.
        if city not in request_dic:
            continue
        threads_list.append(MyThread(fun_, city))
    for t in threads_list:
        t.daemon = False
        t.start()
    for t in threads_list:
        t.join()


if __name__ == '__main__':
    main()
ORDER BY today_used ASC' % (MAX_USED_TIMES)
时间: 2024-11-12 14:01:20