/**
 * Imports every article listed in DATA_DIR . 'infomation.json', stores it through
 * ArticleModel and writes one static HTML page per article.
 *
 * For each feed entry the method:
 *  1. resolves the top-level category id (via the name => id map built from
 *     getXmlAction()) and picks one of three colours from it;
 *  2. downloads the linked page and, depending on the length of the last URL
 *     segment, extracts the source line, the second-level category and the
 *     article body;
 *  3. strips inline styles/scripts and rewrites relative src/href values to
 *     absolute URLs;
 *  4. tries ArticleModel::add(); when that throws, the row is updated only if
 *     the stored checksum differs from the new one;
 *  5. renders /article/infoContent.phtml into
 *     PROJECT_ROOT/html/article/a{id}.html and sleeps 0.7 s before the next entry.
 *
 * Output: echoes the running update counter after every successful insert and
 * finally prints how many rows were updated.
 *
 * NOTE(review): the page is fetched twice per entry (getDataAction() and
 * captureGet()); whether those helpers cache is not visible here — confirm
 * before merging the two requests.
 *
 * @return void
 */
public function insertAction()
{
    // The crawl sleeps between entries, so remove the execution time limit.
    ini_set('max_execution_time', '0');

    // Refresh the category listing before inserting anything.
    $getHomeList = $this->getXmlAction();
    $arr_code = array(
        1 => '插入成功',
        -1 => '插入失败!请检查再试!',
        -2 => '获取xml文件失败!请检查再试!',
    );
    showApiCode($arr_code);

    // Turn the category rows into a "name => id" lookup table.
    $getHomeList = array_column($getHomeList, 'id', 'name');

    // Colour assigned to an article, chosen by category id modulo 3.
    $color = array(
        0 => '#a56d57',
        1 => '#4c889c',
        2 => '#658965',
    );

    $ArticleModel = new ArticleModel();
    $Utils_CaptureWebContent = new Utils_CaptureWebContent('');

    // Load the feed; a missing or malformed file is treated as an empty feed.
    $xml = json_decode(file_get_contents(DATA_DIR . 'infomation.json'), true);
    if (!is_array($xml)) {
        $xml = array();
    }

    // Collect HTML parser complaints internally instead of silencing them with "@".
    $previousLibxmlSetting = libxml_use_internal_errors(true);

    // Patterns that do not depend on the current entry are built once.
    $src_pat = '/src="(\.?\/?upload.+?)"/';
    $href_pat = '/href="(\.?\/?upload.+?)"/';
    // Skip values that already carry a scheme ("http:", "mailto:", ...) or are
    // protocol-relative. The former "[^http]" was a character class and wrongly
    // skipped every relative link whose first letter was h, t or p.
    $notAbsolute = '(?![a-z][a-z0-9+.\-]*:|\/\/)';
    $src_pat2 = '/src="' . $notAbsolute . '(.+?)"/is';
    $href_pat2 = '/href="' . $notAbsolute . '(.+?)"/is';
    $src_one = '/<img[^>]*src="([^>"]*)"/is';

    // Number of rows updated during this run; must live outside the loop.
    $i = 0;

    foreach ($xml as $value) {
        $title = $value['title'];
        $summary = $value['description'];
        // First-level category name; replaced by the second-level one below.
        $category_name = $value['category'];
        $send_time = strtotime($value['pubDate']);
        $utime = $ctime = time();

        // First-level category id and the colour derived from it.
        $article_category = isset($getHomeList[$category_name]) ? $getHomeList[$category_name] : null;
        $category_color = $color[$article_category % 3];

        $content_url = $value['link'];
        // The article id is the unsigned CRC32 of its link.
        $id = sprintf('%u', crc32($content_url));

        // Force a UTF-8 <head> so DOMDocument decodes the page correctly.
        $out = $this->getDataAction($content_url);
        $out = preg_replace(
            array('/<head>([\s\S]+?)<\/head>/i'),
            array('<head><meta http-equiv="Content-Type" content="text/html;charset=utf-8"></head>'),
            $out['output']
        );

        // A fresh document per page keeps nodes of the previous page from leaking in.
        $dom = new DOMDocument();
        if (is_string($out) && $out !== '') {
            $dom->loadHTML($out);
            libxml_clear_errors();
        }
        $xpath = new DOMXPath($dom);

        // Length of the last "/segment" of the link decides which layout to expect.
        $tailLength = strlen(strrchr($content_url, '/'));

        $html = $Utils_CaptureWebContent->captureGet($content_url);
        $html = $Utils_CaptureWebContent->formatHtml($html);

        // Reset per entry so a type-1 link cannot inherit the previous article's data.
        $from_site = '';
        $content = '';

        if ($tailLength < 2) {
            // Type 1 (non-standard link, ends with "/"): nothing is extracted, e.g.
            // http://kjs.mep.gov.cn/hjbhbz/bzwb/dqhjbh/jcgfffbz/
        } elseif ($tailLength < 10) {
            // Type 2 (standard link), e.g. http://www.gdczepb.gov.cn/detail/24441
            $site = $xpath->query("//div[@class='cdaylist']/ul/li");
            $siteNode = $site ? $site->item(0) : null;
            $from_site = $siteNode ? $siteNode->nodeValue : '';
            if (strlen(trim($from_site)) < 10) {
                $from_site = '来源:资讯';
            }

            // Second-level category: text after the last tag of the "cnav" div.
            $cate_html = $Utils_CaptureWebContent->matchHtmlElement("div", "class", "cnav", $html);
            // NOTE(review): the pattern removes plain spaces as written; it may
            // originally have targeted "&nbsp;" — confirm against the site markup.
            $cate_html = preg_replace('/ /', '', $cate_html);
            $lastTagEnd = strrpos($cate_html, '>');
            $category_name = $lastTagEnd === false
                ? (string) $cate_html
                : (string) substr($cate_html, $lastTagEnd + 1);

            if (!$category_name) {
                // Otherwise the category is the text of the last <a> inside the div.
                $cate_name = $xpath->query("//div[@class='cnav']/a");
                $lastLink = ($cate_name && $cate_name->length > 0)
                    ? $cate_name->item($cate_name->length - 1)
                    : null;
                $category_name = $lastLink ? $lastLink->nodeValue : null;
            }

            $content = $Utils_CaptureWebContent->matchHtmlElement("div", "class", "contents", $html);
        } else {
            // Type 3 (semi-standard link), e.g.
            // http://kjs.mep.gov.cn/hjbhbz/bzwb/stzl/201109/t20110919_217415.htm
            // The page carries no source line, so a fixed one is used.
            $from_site = '来源:科学技术司';

            $tables = $Utils_CaptureWebContent->matchAllHtmlElement("table", "class", "txtnormal", $html);
            // implode(separator, array): the reversed legacy order was removed in PHP 8.
            $content = (isset($tables[0]) && is_array($tables[0])) ? implode('', $tables[0]) : '';

            $crumbs = $Utils_CaptureWebContent->matchAllHtmlElement("a", "class", "dtdir12 CurrChnlCls", $html);
            $category_name = isset($crumbs[1][3]) ? $crumbs[1][3] : null;
        }

        // Embedded resources come in several shapes: "upload/..", "/upload/..",
        // "./upload/file", a bare file name, or "./file".
        $urlParts = parse_url($content_url);
        $host = 'http://' . (isset($urlParts['host']) ? $urlParts['host'] : '') . '/';
        $host_name = dirname($content_url) . '/';

        // Drop inline styling and scripts.
        $content = preg_replace(
            array(
                '/style=".+?"/i',
                '/width=".+?"/i',
                '/<style([\s\S]+?)<\/style>/i',
                '/<script([\s\S]+?)<\/script>/i',
            ),
            '',
            $content
        );

        // Pass 1: "upload" paths are resolved against the site root.
        $content = preg_replace(
            array($src_pat, $href_pat),
            array('src="' . $host . '$1"', 'href="' . $host . '$1"'),
            $content
        );

        // Pass 2: every remaining relative path is resolved against the page directory.
        $content = preg_replace(
            array($src_pat2, $href_pat2),
            array('src="' . $host_name . '$1"', 'href="' . $host_name . '$1"'),
            $content
        );

        // The first image of the body, if any, becomes the cover.
        $cover_url = array();
        preg_match($src_one, $content, $cover_url);

        // Fall back to the stored body when nothing could be extracted this time.
        $old_data = $ArticleModel->getItem($id);
        if (trim($content) === '') {
            $content = isset($old_data['content']) ? $old_data['content'] : '';
        } else {
            $content = htmlspecialchars($content);
        }

        // Checksum of title + body, used to detect changed articles.
        $data_check = sprintf('%u', crc32($title . $content));

        $params = array(
            "id" => $id,
            "link" => $content_url,
            "article_category" => $article_category,
            "title" => $title,
            "summary" => $summary,
            "content" => $content,
            "send_time" => $send_time,
            "from_site" => $from_site,
            "ctime" => $ctime,
            "utime" => $utime,
            "category_name" => $category_name,
            "category_color" => $category_color,
            "cover_url" => isset($cover_url[1]) ? $cover_url[1] : null,
            "data_check" => $data_check,
        );

        // Nothing to store or render without a body.
        if (empty($content)) {
            continue;
        }

        try {
            $ArticleModel->add($params);
            // Progress marker printed after each successful insert.
            echo $i;
        } catch (Exception $e) {
            // add() threw (presumably a duplicate id — confirm): update only when
            // the stored checksum differs from the freshly computed one.
            $old_check = isset($old_data['data_check']) ? $old_data['data_check'] : null;
            if ($old_check != $data_check) {
                $ArticleModel->update($params, " id = {$id} ");
                $i++;
            }
        }

        // Render the article through the template and cache it as a static page.
        $need = array(
            'title' => $title,
            'content' => htmlspecialchars_decode($content),
            'from_site' => $from_site,
            'send_time' => $send_time,
        );
        $data_test = array(
            'info' => $need,
        );

        ob_start();
        $this->display("/article/infoContent.phtml", $data_test);
        $id_html = ob_get_clean();
        file_put_contents(PROJECT_ROOT . '/html/article/a' . $id . '.html', $id_html);

        // Pause 0.7 s between entries.
        usleep(700000);
    }

    libxml_use_internal_errors($previousLibxmlSetting);

    printf("本次更新了 %s 条数据", $i);
}
php原生态生成静态缓存,配合crontab定时刷新缓存,不需要第三方模板
时间: 2024-09-29 01:17:00