这个题是阿里的一个电话面试题,我想的头一个解决方案,有点限制,后面再写优化的
实验数据,python从百度抓得:
# -*- coding: utf-8 -*- """ Spyder Editor This is a temporary script file. """ import urllib2 import re import os #connect to a URL #一页的搜索结果中url大概是200个左右 file_url = open('url.txt','ab+') #搜索框里的东西,这块可以设置成数字好让每次搜索的结果不一样 search = '123' url = "http://www.baidu.com/s?wd="+search def setUrlToFile(): website = urllib2.urlopen(url) #read html code html = website.read() #use re.findall to get all the links links = re.findall('"((http|ftp)s?://.*?)"', html) for s in links: print s[0] if len(s[0]) < 256: file_url.write(s[0]+'\r\n') #收集实验数据 for i in range(0,50): setUrlToFile() file_url.close() ###需要重新打开再读一下 file_url = open('url.txt','r') file_lines = len(file_url.readlines()) print "there are %d url in %s" %(file_lines,file_url) file_url.close()
c++ 写的读 url.xml,放到map里面,对map<string , int>的value进行排序,得到前100个,运行一下也就55s,还是很快的,url长度进行了限制小于256个字符:
#pragma once /* //计算代码段运行时间的类 // */ #include <iostream> #ifndef ComputeTime_h #define ComputeTime_h //单位毫秒 class ComputeTime { private: int Initialized; __int64 Frequency; __int64 BeginTime; public: bool Avaliable(); double End(); bool Begin(); ComputeTime(); virtual ~ComputeTime(); }; #endif #include "stdafx.h" #include "ComputeTime.h" #include <iostream> #include <Windows.h> ComputeTime::ComputeTime() { Initialized=QueryPerformanceFrequency((LARGE_INTEGER *)&Frequency); } ComputeTime::~ComputeTime() { } bool ComputeTime::Begin() { if(!Initialized) return 0; return QueryPerformanceCounter((LARGE_INTEGER *)&BeginTime); } double ComputeTime::End() { if(!Initialized) return 0; __int64 endtime; QueryPerformanceCounter((LARGE_INTEGER *)&endtime); __int64 elapsed = endtime-BeginTime; return ((double)elapsed/(double)Frequency)*1000.0; //单位毫秒 } bool ComputeTime::Avaliable() { return Initialized; } // sortUrl.cpp : 定义控制台应用程序的入口点。 // #include "stdafx.h" //#include <utility> #include <vector> #include <map> #include <fstream> #include <iostream> #include <string> #include <algorithm> #include "ComputeTime.h" using namespace std; map<string,int> urlfrequency; typedef pair<string, int> PAIR; struct CmpByValue { bool operator()(const PAIR& lhs, const PAIR& rhs) { return lhs.second > rhs.second; } }; void find_largeTH(map<string,int> urlfrequency) { //把map中元素转存到vector中 ,按照value排序 vector<PAIR> url_quency_vec(urlfrequency.begin(), urlfrequency.end()); sort(url_quency_vec.begin(), url_quency_vec.end(), CmpByValue()); //url_quency_vec.size() for (int i = 0; i != 100; ++i) { cout<<url_quency_vec[i].first<<endl; cout<<url_quency_vec[i].second<<endl; } } //urlheap的建立过程,URL插入时候存在的 void insertUrl(string url) { pair<map<string ,int>::iterator, bool> Insert_Pair; Insert_Pair = urlfrequency.insert(map<string, int>::value_type(url,1)); if (Insert_Pair.second == false) { (Insert_Pair.first->second++); } } int _tmain(int argc, _TCHAR* argv[]) { fstream URLfile; char buffer[1024]; URLfile.open("url.txt",ios::in|ios::out|ios::binary); if (! URLfile.is_open()) { cout << "Error opening file"; exit (1); } else { cout<<"open file success!"<<endl; } ComputeTime cp; cp.Begin(); int i = 0; while (!URLfile.eof()) { URLfile.getline (buffer,1024); //cout << buffer << endl; string temp(buffer); //cout<<i++<<endl; insertUrl(temp); } find_largeTH(urlfrequency); cout<<"running time: "<<cp.End()<<"ms"<<endl; getchar(); //system("pause"); return 0; }
实验结果:
版权声明:本文为博主原创文章,未经博主允许不得转载。
时间: 2024-10-27 13:43:07