http://www.catalogueoflife.org/col/browse/classification
这是一个国外的生物信息网站
今天的代码可以抓取指定分类的信息(id,学名)
没有把多线程写进去,略失败...
运用:WebClient、Regex(正则表达式)、System.IO(文件读写)
项目在>>>开源中国
1 using System;
2 using System.Collections.Generic;
3 using System.ComponentModel;
4 using System.Data;
5 using System.Drawing;
6 using System.Linq;
7 using System.Text;
8 using System.Threading.Tasks;
9 using System.Windows.Forms;
10 using System.Net;
11 using System.Text.RegularExpressions;
12 using System.Threading;
13 using System.IO;
namespace cateoflife
{
    /// <summary>
    /// Main window of the scraper. Downloads a run of numbered pages, applies a
    /// user-supplied regular expression to each page and appends one
    /// "id&lt;TAB&gt;name" line per match to a text file named after the first
    /// page number.
    /// </summary>
    public partial class Form1 : Form
    {
        // Last page number used when the "end" box contains 0.
        private const int DefaultEnd = 99999;

        // Each page is requested at most this many times (one retry), which
        // matches the single retry the previous implementation performed.
        private const int MaxDownloadAttempts = 2;

        // Fixed patterns applied to every matched fragment: the first run of
        // digits that follows a '/', and up to two word tokens that follow a '>'.
        private static readonly Regex IdPattern = new Regex("(?<=/)\\d+");
        private static readonly Regex NamePattern = new Regex("(?<=>)\\w+\\s*\\w+");

        WebClient wc = new WebClient();
        int start;
        int end;
        string url;
        string reg;
        string msg;
        // Not read anywhere in this part of the partial class; kept because
        // another part of the class may reference it.
        int now = 1;

        /// <summary>Creates the form and its designer-generated controls.</summary>
        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Click handler that runs the scrape. Reads the first page number
        /// (textBox2), the last page number (textBox3, where 0 means
        /// <see cref="DefaultEnd"/>), the URL prefix (textBox1) and the match
        /// pattern (textBox4), then processes the pages in ascending order.
        /// </summary>
        /// <remarks>
        /// The work runs synchronously on the calling thread. Invalid input is
        /// reported with a message box instead of an unhandled exception. The
        /// run stops at the first page that cannot be downloaded after one
        /// retry; the output file is always closed, even when an error escapes.
        /// </remarks>
        /// <param name="sender">The control that raised the event (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void button1_Click(object sender, EventArgs e)
        {
            int firstPage;
            if (!int.TryParse(textBox2.Text, out firstPage))
            {
                MessageBox.Show("The start value must be a whole number.");
                return;
            }

            int lastPage;
            if (!int.TryParse(textBox3.Text, out lastPage))
            {
                MessageBox.Show("The end value must be a whole number (0 means no explicit limit).");
                return;
            }

            Regex pattern;
            try
            {
                // Validate the user's pattern once, up front, instead of
                // failing on every page inside the download loop.
                pattern = new Regex(textBox4.Text);
            }
            catch (ArgumentException ex)
            {
                MessageBox.Show("The regular expression is not valid: " + ex.Message);
                return;
            }

            start = firstPage;
            end = (lastPage == 0) ? DefaultEnd : lastPage;
            url = textBox1.Text;
            reg = textBox4.Text;
            wc.Encoding = Encoding.UTF8;

            // Second argument 'true' opens the file in append mode, so results
            // of earlier runs are kept. The using block guarantees the file is
            // flushed and closed on every exit path.
            using (StreamWriter writer = new StreamWriter(start + ".txt", true))
            {
                for (int i = start; i <= end; i++)
                {
                    string html;
                    WebException failure;
                    if (!TryDownload(url + i, out html, out failure))
                    {
                        MessageBox.Show("Stopped at page " + i + ": " + failure.Message);
                        break;
                    }

                    foreach (Match m in pattern.Matches(html))
                    {
                        gettxt(m.Value);
                        writer.Write(msg);
                    }

                    // Push each finished page to disk so progress survives if
                    // the run is interrupted later.
                    writer.Flush();
                }
            }
        }

        /// <summary>
        /// Downloads <paramref name="address"/> as a string, retrying after a
        /// <see cref="WebException"/> until <see cref="MaxDownloadAttempts"/>
        /// requests have been made.
        /// </summary>
        /// <param name="address">Address of the page to fetch.</param>
        /// <param name="html">Downloaded text, or null when every attempt failed.</param>
        /// <param name="lastError">The most recent failure, or null on success.</param>
        /// <returns>true when the page was downloaded; otherwise false.</returns>
        private bool TryDownload(string address, out string html, out WebException lastError)
        {
            html = null;
            lastError = null;

            for (int attempt = 1; attempt <= MaxDownloadAttempts; attempt++)
            {
                try
                {
                    html = wc.DownloadString(address);
                    lastError = null;
                    return true;
                }
                catch (WebException ex)
                {
                    // Remember the failure and try again if attempts remain.
                    lastError = ex;
                }
            }

            return false;
        }

        /// <summary>
        /// Builds the output line for one matched fragment and stores it in
        /// <c>msg</c>: the digits following a '/', a tab, the word(s) following
        /// a '>', then CR LF. A part that is not found contributes an empty string.
        /// </summary>
        /// <param name="html">The matched fragment to extract from.</param>
        void gettxt(string html)
        {
            string id = IdPattern.Match(html).Value;
            string name = NamePattern.Match(html).Value;
            msg = id + "\t" + name + "\r\n";
        }
    }
}
时间: 2024-10-19 19:52:02