<?php
// +------------------------------------------------+
// | http://www.cjango.com                          |
// +------------------------------------------------+
// | Fixing bugs doesn't happen overnight -         |
// | let's talk about it once I'm drunk!            |
// +------------------------------------------------+
// | Author: 小陈叔叔 <Jason.Chen>                  |
// +------------------------------------------------+
namespace app\common\service;
|
||
|
||
use app\common\model\Article as ArticleModel;
|
||
use tools\Str;
|
||
|
||
class CeArticle extends _Init
|
||
{
|
||
/**
 * Collect a WeChat official-account article and store it as an Article row.
 *
 * Fetches the page at $url, extracts the title and the "js_content" body,
 * copies every <img> and CSS background image into
 * ./uploads/collect/<Y-m/d>/, rewrites video iframes, and saves the result.
 *
 * If a row with the same url and uid already exists it is deleted and its
 * id is returned; nothing is fetched in that case.
 *
 * @param  string  $url Address of the article page to collect.
 * @param  integer $uid Id of the user the article is collected for.
 * @return mixed   The article id, or false when the page cannot be read,
 *                 has no "js_content" body, or cannot be saved.
 */
public static function collect($url, $uid = 1)
{
    $map = [
        'url' => $url,
        'uid' => $uid,
    ];

    $info = ArticleModel::where($map)->find();
    if ($info) {
        // NOTE(review): an existing record is deleted and its (now stale) id
        // is returned. Kept as-is; confirm this "toggle" is intended.
        $info->delete();

        return $info->id;
    }

    // NOTE(review): $url is fetched without a scheme check, as before;
    // confirm callers only pass http(s) addresses.
    $html = file_get_contents($url);
    if ($html === false || $html === '') {
        return false;
    }

    // Article title and article body.
    preg_match_all("/id=\"activity-name\">(.*)<\/h2>/is", $html, $titleMatch);
    preg_match_all("/id=\"js_content\">(.*)<script/iUs", $html, $bodyMatch, PREG_PATTERN_ORDER);
    if (!isset($bodyMatch[1][0])) {
        return false;
    }

    // Normalise the title: drop spaces and CRLF pairs, then unwrap it.
    $rawTitle = isset($titleMatch[1][0]) ? $titleMatch[1][0] : '';
    $title    = str_replace("\r\n", "", str_replace(" ", "", $rawTitle));
    $title    = self::get_title($title);

    // Re-open the content wrapper that the pattern above consumed.
    $content = "<div id='js_content' class='ce-padding-sm'>" . $bodyMatch[1][0];

    // Images are stored by date; the first stored image becomes the thumb.
    $dir   = './uploads/collect/' . date('Y-m/d/');
    $thumb = '';

    // Only real <img> tags: the lookahead keeps <i>, <input>, <iframe>, ... out.
    preg_match_all('/<img(?=[\s\/>])[^>]*>/i', $content, $imgTags);
    foreach ($imgTags[0] as $tag) {
        $remote = self::collectImageSource($tag);
        if ($remote === null) {
            continue;
        }

        $realfile = self::collectStoreImage($remote, $dir);
        if ($realfile === false) {
            // Best effort: leave the original tag in place when the copy fails.
            continue;
        }

        // Keep a non-empty inline style, otherwise force the image visible.
        if (preg_match('/style=".*?"/is', $tag, $style) && $style[0] != 'style=""') {
            $styleAttr = $style[0];
        } else {
            $styleAttr = 'style="width: auto !important; height: auto !important; visibility: visible !important;"';
        }

        // "./uploads/..." becomes the web path "/uploads/...".
        $local   = ltrim($realfile, '.');
        $content = str_replace($tag, '<img src="' . $local . '" ' . $styleAttr . '>', $content);

        if ($thumb === '') {
            $thumb = $local;
        }
    }

    // Background images declared in inline CSS.
    $content = str_replace("background-image: url", "background-image:url", $content);
    preg_match_all("|background-image:url\((.*)\);|U", $content, $backgrounds);
    foreach ($backgrounds[1] as $raw) {
        // Inside a style attribute the URL is wrapped in &quot; entities.
        $remote   = str_replace("&quot;", "", $raw);
        $realfile = self::collectStoreImage($remote, $dir);
        if ($realfile === false) {
            continue;
        }
        $content = str_replace($remote, ltrim($realfile, '.'), $content);
    }

    // Point video links at the player page instead of the preview page.
    $content = str_replace("preview.html", "player.html", $content);

    // Rebuild every lazy-loaded iframe as a plain, fixed-size player iframe.
    preg_match_all('/<iframe[^>]*\s+data-src="([^"]*)"[^>]*>/is', $content, $frames);
    foreach ($frames[0] as $key => $frame) {
        $src     = explode('&', $frames[1][$key]);
        $player  = '<p style="max-width:100%; margin:14px"><iframe frameborder="0" src="' . $src[0] . '&auto=0" style="z-index: 1; width: 100% ! important; height: 231.75px ! important; overflow: hidden;" class="video_iframe" scrolling="no"></iframe></p>';
        $content = str_replace($frame, $player, $content);
    }

    $description = self::get_description($content);

    // Account name and avatar come from inline script variables on the page.
    $nickname = preg_match('/var nickname = \"(.*?)\";/si', $html, $m) ? $m[1] : '';
    $head_img = preg_match('/var round_head_img = \"(.*?)\";/si', $html, $m) ? $m[1] : '';

    $data = [
        'title'       => $title,
        'content'     => $content,
        'description' => $description,
        'category_id' => 0,
        'storage_id'  => 0,
        'thumb'       => $thumb,
        'status'      => 1,
        'create_time' => time(),
        'update_time' => 0,
        'url'         => $url,
        'nickname'    => $nickname,
        'head_img'    => $head_img,
        'uid'         => $uid,
        'click'       => 0,
    ];

    $info = new ArticleModel;
    // Assumes save() returns false on failure - TODO confirm against the model base class.
    if ($info->save($data) === false) {
        return false;
    }

    return $info->id;
}

/**
 * Pick the image address out of an <img> tag.
 *
 * Attributes are tried in the order data-src, data-croporisrc, src.
 *
 * @param  string      $tag Complete <img ...> tag.
 * @return string|null The attribute value, or null when none is present.
 */
private static function collectImageSource($tag)
{
    foreach (['data-src', 'data-croporisrc', 'src'] as $attr) {
        if (preg_match('/' . $attr . '="(.*?)"/is', $tag, $m) && $m[1] !== '') {
            return $m[1];
        }
    }

    return null;
}

/**
 * Build the local file name for a remote image address.
 *
 * The name is the fifth "/"-separated piece of the address (presumably the
 * media hash of a WeChat image URL - verify), with md5 of the address as a
 * fallback. The extension is the alphanumeric value of wx_fmt=, or "png".
 *
 * @param  string $remote Remote image address.
 * @return string File name including extension, safe to append to a directory.
 */
private static function collectImageName($remote)
{
    $parts = explode('/', $remote);
    $name  = isset($parts[4]) ? preg_replace('/[?#].*$/', '', $parts[4]) : '';
    if (!preg_match('/^[\w\-]+$/', $name)) {
        $name = md5($remote);
    }

    // Take only the format token, not any query parameters that follow it.
    $ext = preg_match('/wx_fmt=([A-Za-z0-9]+)/', $remote, $m) ? $m[1] : 'png';

    return $name . '.' . $ext;
}

/**
 * Download a remote image into $dir.
 *
 * Only http:// and https:// addresses are fetched, so an address scraped
 * from the page can never make file_get_contents() read a local path.
 *
 * @param  string       $remote Remote image address.
 * @param  string       $dir    Target directory, with trailing slash.
 * @return string|false Path of the stored file, or false on any failure.
 */
private static function collectStoreImage($remote, $dir)
{
    if (!preg_match('#^https?://#i', $remote)) {
        return false;
    }

    try {
        if (!is_dir($dir) && !mkdir($dir, 0755, true) && !is_dir($dir)) {
            return false;
        }

        $image = file_get_contents($remote);
        if ($image === false) {
            return false;
        }

        $realfile = $dir . self::collectImageName($remote);
        if (file_put_contents($realfile, $image) === false) {
            return false;
        }

        return $realfile;
    } catch (\Exception $e) {
        // Fully qualified on purpose: a bare "Exception" would resolve
        // inside this file's namespace and never match.
        return false;
    }
}
|
||
|
||
/**
 * Extract the readable title from the raw title fragment.
 *
 * When the fragment contains a "document.write" call, the title is the
 * eighth piece obtained by splitting the fragment on double quotes.
 * Otherwise, or when that piece is missing or empty, the title is
 * everything before the first "</h2>".
 *
 * @param  string $chars Raw title fragment.
 * @return string The extracted title.
 */
public static function get_title($chars)
{
    $title = '';

    // Compare against false: a match at offset 0 is still a match.
    if (stripos($chars, "document.write") !== false) {
        $pieces = explode('"', $chars);
        if (isset($pieces[7])) {
            $title = $pieces[7];
        }
    }

    if (empty($title)) {
        $pieces = explode('</h2>', $chars);
        $title  = $pieces[0];
    }

    return $title;
}
|
||
|
||
/**
 * Build a short plain-text description from article HTML.
 *
 * The content is passed through clearHtml() and strip_tags(), single quotes
 * are removed, whitespace is dropped with trimall(), and the result is cut
 * with Str::msubstr($text, 0, 140).
 *
 * @param  string $content Article HTML.
 * @return string The description text.
 */
public static function get_description($content)
{
    $plain = strip_tags(self::clearHtml($content));
    $plain = str_replace("'", '', $plain);

    return Str::msubstr(self::trimall($plain), 0, 140);
}
|
||
|
||
/**
 * Remove the given HTML tags from a string.
 *
 * For every tag name, a complete element (opening tag, content, closing
 * tag) is removed first; any remaining opening tag of that name is removed
 * afterwards. Matching is case-insensitive.
 *
 * @param  array  $tags Tag names to remove, e.g. ['script', 'style'].
 * @param  string $str  HTML to clean.
 * @return string The HTML without the listed tags.
 */
public static function strip_html_tags($tags, $str)
{
    $patterns = [];
    foreach ($tags as $tag) {
        $name = preg_quote($tag, '/');
        // The lookahead ends the tag name, so "b" does not also hit <br> or <body>.
        $open = '<' . $name . '(?=[\s\/>])[^>]*>';

        $patterns[] = '/' . $open . '[\s\S]*?<\/' . $name . '\s*>/i';
        $patterns[] = '/' . $open . '/i';
    }

    return preg_replace($patterns, '', $str);
}
|
||
|
||
/**
 * Strip unwanted markup and script-related text from an HTML string.
 *
 * Works in two passes, each applied strictly in the listed order: a set of
 * literal strings is deleted, then a table of regular-expression rules is
 * applied one after another.
 *
 * @param  string $descclear HTML to clean.
 * @return string The cleaned text.
 */
public static function clearHtml($descclear)
{
    // Pass 1: literal removals (line breaks, tabs, and the " " literal).
    // NOTE(review): the " " literal may originally have been an HTML entity
    // such as a non-breaking space - confirm against the upstream file.
    foreach (["\r", "\n", "\t", "\r\n", " "] as $needle) {
        $descclear = str_replace($needle, "", $descclear);
    }

    // Pass 2: [pattern, replacement] rules.
    $rules = [
        ["/\s+/", " "],                                              // collapse whitespace runs
        ["/<[ ]+/si", "<"],                                          // "<" followed by spaces
        ["/<\!--.*?-->/si", ""],                                     // HTML comments
        ["/<(\!.*?)>/si", ""],                                       // DOCTYPE and other <!...>
        ["/<(\/?html.*?)>/si", ""],                                  // html tags
        ["/<(\/?head.*?)>/si", ""],                                  // head tags
        ["/<(\/?meta.*?)>/si", ""],                                  // meta tags
        ["/<(\/?body.*?)>/si", ""],                                  // body tags
        ["/<(\/?link.*?)>/si", ""],                                  // link tags
        ["/<(\/?form.*?)>/si", ""],                                  // form tags
        ["/cookie/si", "COOKIE"],                                    // rewrite the word "cookie"
        ["/<(applet.*?)>(.*?)<(\/applet.*?)>/si", ""],               // applet elements
        ["/<(\/?applet.*?)>/si", ""],                                // leftover applet tags
        ["/<(style.*?)>(.*?)<(\/style.*?)>/si", ""],                 // style elements
        ["/<(\/?style.*?)>/si", ""],                                 // leftover style tags
        ["/<(title.*?)>(.*?)<(\/title.*?)>/si", ""],                 // title elements
        ["/<(\/?title.*?)>/si", ""],                                 // leftover title tags
        ["/<(object.*?)>(.*?)<(\/object.*?)>/si", ""],               // object elements
        ["/<(\/?objec.*?)>/si", ""],                                 // leftover object tags
        ["/<(noframes.*?)>(.*?)<(\/noframes.*?)>/si", ""],           // noframes elements
        ["/<(\/?noframes.*?)>/si", ""],                              // leftover noframes tags
        ["/<(i?frame.*?)>(.*?)<(\/i?frame.*?)>/si", ""],             // frame / iframe elements
        ["/<(\/?i?frame.*?)>/si", ""],                               // leftover frame / iframe tags
        ["/<(script.*?)>(.*?)<(\/script.*?)>/si", ""],               // script elements
        ["/<(\/?script.*?)>/si", ""],                                // leftover script tags
        ["/javascript/si", "Javascript"],                            // rewrite "javascript"
        ["/vbscript/si", "Vbscript"],                                // rewrite "vbscript"
        ["/on([a-z]+)\s*=/si", "On\\1="],                            // rewrite on...= attributes
        // NOTE(review): as written this replaces "&#" with itself - verify the intended replacement.
        ["/&#/si", "&#"],
        // Final sweep over a fixed list of tag names.
        ["/<(\/?)(script|i?frame|style|html|body|li|i|map|title|img|link|span|u|font|table|tr|b|marquee|td|strong|div|a|meta|\?|\%)([^>]*?)>/isU", ""],
    ];

    foreach ($rules as $rule) {
        $descclear = preg_replace($rule[0], $rule[1], $descclear);
    }

    return $descclear;
}
|
||
|
||
/**
 * Remove spaces, tabs, line feeds and carriage returns from a string.
 *
 * @param  string $str Text to compact.
 * @return string The text without those characters.
 */
public static function trimall($str)
{
    // NOTE(review): the two space-like entries look identical here; one may
    // originally have been a different space character - confirm upstream.
    $unwanted = [" ", " ", "\t", "\n", "\r"];

    return str_replace($unwanted, "", $str);
}
|
||
|
||
}
|