WebMagic 爬虫

/**
 * WebMagic page processor that reads the code column (td[1]) and the name
 * column (td[3]) of every {@code villagetr} row in the {@code villagetable}
 * table of the fetched page and batch-inserts one row per entry into
 * {@code HECV_AY.CM_ADMINISTRATIVE_AREA}.
 *
 * <p>All rows are written in a single transaction: either every scraped row is
 * committed or, on any SQL failure, the batch is rolled back.
 */
public class MoviePaperPageProcessor implements PageProcessor {

    // The attribute values must be delimited by plain ASCII quotes; typographic
    // quotes are not valid XPath string delimiters.
    private static final String CODE_XPATH =
            "//table[@class='villagetable']//tr[@class='villagetr']/td[1]/text()";
    private static final String NAME_XPATH =
            "//table[@class='villagetable']//tr[@class='villagetr']/td[3]/text()";

    // NOTE: there is intentionally no space between ")" and "VALUES"; the
    // statement text is kept exactly as it was.
    private static final String INSERT_SQL =
            "INSERT INTO HECV_AY.CM_ADMINISTRATIVE_AREA (ID, CODE, NAME, SHORTNAME, LOOKUP, DEGREE, POS, PARENT, ANCESTOR, DEPTH, TERMINAL, DELETED, CREATED_TIME, LAST_MODIFIED_TIME)"
                    + "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)";

    /** Value written to the PARENT column of every inserted row. */
    private static final String PARENT_ID = "f3505dfd-b255-4df6-84e5-09dd03b666db";

    /** Value written to the ANCESTOR column of every inserted row. */
    private static final String ANCESTOR_PATH =
            "(0d0c03f6-934f-40b2-bb4c-b27846f5e987),(12f196e1-d4b8-4c87-8e37-a5bf15d69222),(14ea5270-8e17-4071-9e2f-b34434be1b4b),(40043731-734c-4e65-aba9-5e5928644931),(f3505dfd-b255-4df6-84e5-09dd03b666db)";

    /** Crawl settings: retry up to 3 times, sleep 1000 ms between requests. */
    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    // NOTE(review): connection settings and credentials are hard-coded in source;
    // consider loading them from configuration instead.
    String url = "jdbc:oracle:thin:@192.168.2.161:1521:orcl";
    String username = "hecv_ay";
    String password = "000000";

    /**
     * @return the crawl settings used by the spider for this processor
     */
    public Site getSite() {
        return site;
    }

    /**
     * Extracts the code and name columns from the page and stores them.
     *
     * <p>Only rows for which both a code and a name were extracted are stored;
     * if nothing was extracted the method returns without touching the database.
     *
     * @param page the downloaded page handed over by the spider
     */
    public void process(Page page) {
        List<String> codes = page.getHtml().xpath(CODE_XPATH).all();
        List<String> names = page.getHtml().xpath(NAME_XPATH).all();

        // The two columns are paired by index, so only the common prefix is usable.
        int rowCount = Math.min(codes.size(), names.size());
        if (rowCount == 0) {
            return;
        }

        // Debug sample of the fourth row; skipped when the page has fewer rows.
        if (rowCount > 3) {
            System.out.println(codes.get(3) + "---" + names.get(3));
        }

        try {
            saveAreas(codes, names, rowCount);
        } catch (SQLException se) {
            System.out.println("数据库连接失败！");
            se.printStackTrace();
        }
    }

    /**
     * Inserts the first {@code rowCount} code/name pairs in one transaction.
     * The connection and statement are always closed; the batch is rolled back
     * if any statement fails.
     */
    private void saveAreas(List<String> codes, List<String> names, int rowCount) throws SQLException {
        ChineseCharToEn cte = new ChineseCharToEn();
        // One timestamp for the whole batch instead of a new one per column.
        java.sql.Date now = new java.sql.Date(System.currentTimeMillis());

        try (Connection conn = DriverManager.getConnection(url, username, password)) {
            conn.setAutoCommit(false);
            try (PreparedStatement ps = conn.prepareStatement(INSERT_SQL)) {
                for (int i = 0; i < rowCount; i++) {
                    String name = names.get(i);
                    ps.setString(1, UUID.randomUUID().toString());
                    ps.setString(2, codes.get(i));
                    ps.setString(3, name);
                    ps.setString(4, name);
                    ps.setString(5, cte.getAllFirstLetter(name));
                    ps.setString(6, "70");
                    // POS is 10, 20, 30, ... in scrape order.
                    ps.setString(7, 10 * (i + 1) + "");
                    ps.setString(8, PARENT_ID);
                    ps.setString(9, ANCESTOR_PATH);
                    ps.setInt(10, 6);
                    ps.setInt(11, 0);
                    ps.setInt(12, 0);
                    ps.setDate(13, now);
                    ps.setDate(14, now);
                    ps.addBatch();
                }
                ps.executeBatch();
                conn.commit();
            } catch (SQLException failure) {
                // Explicit rollback so a partially executed batch is never
                // committed implicitly when the connection is closed.
                try {
                    conn.rollback();
                } catch (SQLException rollbackFailure) {
                    failure.addSuppressed(rollbackFailure);
                }
                throw failure;
            }
        }
    }

    /** Starts a 5-thread spider on the fixed listing page URL. */
    public static void main(String[] args) {
        Spider.create(new MoviePaperPageProcessor())
                .addUrl("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2015/41/05/26/410526103.html")
                .thread(5).run();
    }

}

//获取汉字首字母

/**
 * Maps Chinese characters to a lowercase Latin letter by looking up the
 * character's GB2312 row/column code in a threshold table.
 *
 * <p>Characters that fall outside the table (ASCII, symbols, characters that
 * GB2312 cannot encode) are passed through unchanged.
 */
public class ChineseCharToEn {

    /**
     * Ascending lower bounds of the code ranges; range {@code i} spans
     * {@code [li_SecPosValue[i], li_SecPosValue[i + 1])}. The final entry only
     * serves as the upper bound of the last range.
     */
    private final static int[] li_SecPosValue = { 1601, 1637, 1833, 2078, 2274,
            2302, 2433, 2594, 2787, 3106, 3212, 3472, 3635, 3722, 3730, 3858,
            4027, 4086, 4390, 4558, 4684, 4925, 5249, 5590 };

    /** Letter for each range of {@link #li_SecPosValue} (no entries for i, u, v). */
    private final static String[] lc_FirstLetter = { "a", "b", "c", "d", "e",
            "f", "g", "h", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s",
            "t", "w", "x", "y", "z" };

    /**
     * Returns the concatenated first letters of every character in the string.
     *
     * @param str the string to convert; may be {@code null}
     * @return one result of {@link #getFirstLetter(String)} per character, in
     *         order; the empty string when {@code str} is {@code null} or blank
     */
    public String getAllFirstLetter(String str) {
        if (str == null || str.trim().length() == 0) {
            return "";
        }

        StringBuilder letters = new StringBuilder(str.length());
        for (int i = 0; i < str.length(); i++) {
            letters.append(getFirstLetter(str.substring(i, i + 1)));
        }
        return letters.toString();
    }

    /**
     * Returns the letter for the first character of the given string.
     *
     * @param chinese the character to convert (only the first character of a
     *                longer string is considered); may be {@code null}
     * @return the table letter when the first character encodes to a two-byte
     *         GB2312 code inside the table range; otherwise the first character
     *         itself; the empty string when the input is {@code null} or
     *         consists only of whitespace
     */
    public String getFirstLetter(String chinese) {
        if (chinese == null || chinese.trim().length() == 0) {
            return "";
        }

        String first = chinese.substring(0, 1);
        byte[] gb = encodeGb2312(first);
        // One byte means ASCII or a character GB2312 cannot represent (the
        // encoder substitutes '?'); return the character itself so it is not lost.
        if (gb == null || gb.length != 2) {
            return first;
        }

        // Row and column numbers are the two bytes with the 0xA0 offset removed.
        int sectorCode = (gb[0] & 0xFF) - 160;
        int positionCode = (gb[1] & 0xFF) - 160;
        int secPosCode = sectorCode * 100 + positionCode;
        if (secPosCode <= 1600 || secPosCode >= 5590) {
            // Outside the lookup table, e.g. graphic symbols.
            return first;
        }

        for (int i = 0; i < lc_FirstLetter.length; i++) {
            if (secPosCode >= li_SecPosValue[i] && secPosCode < li_SecPosValue[i + 1]) {
                return lc_FirstLetter[i];
            }
        }
        return first;
    }

    /**
     * Encodes the string as GB2312 bytes.
     *
     * @param str the string to encode
     * @return the encoded bytes, or {@code null} when the runtime does not
     *         provide the GB2312 charset (a message is printed in that case)
     */
    private byte[] encodeGb2312(String str) {
        try {
            return str.getBytes("GB2312");
        } catch (UnsupportedEncodingException ex) {
            System.out.println("字符串编码转换异常：" + ex.getMessage());
            return null;
        }
    }

    /** Small manual demo that prints the letters for a sample name. */
    public static void main(String[] args) {
        ChineseCharToEn cte = new ChineseCharToEn();
        System.out.println("获取拼音首字母：" + cte.getAllFirstLetter("西琉璃村委会"));
    }

}

时间： 2024-10-15 05:29:26

webMagic 爬虫

webMagic 爬虫的相关文章

webmagic爬虫程序

WebMagic爬虫框架及javaEE SSH框架将数据保存到数据库（二）

webmagic爬虫报错，求解答!

基于webmagic的爬虫小应用--爬取知乎用户信息

使用Scrapy爬虫框架简单爬取图片并保存本地(妹子图）

前后端分离的爬虫小项目

Java 解析chm文件实战(原创)

github清理，记录一些有趣的项目

Movie Hell诞生之路