Files
tuiguangzhushou/application/common/service/CeArticle.php
2020-08-06 15:26:41 +08:00

275 lines
12 KiB
PHP
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<?php
// +------------------------------------------------+
// |http://www.cjango.com |
// +------------------------------------------------+
// | 修复BUG不是一朝一夕的事情等我喝醉了再说吧 |
// +------------------------------------------------+
// | Author: 小陈叔叔 <Jason.Chen> |
// +------------------------------------------------+
namespace app\common\service;
use app\common\model\Article as ArticleModel;
use tools\Str;
class CeArticle extends _Init
{
/**
 * Collect a WeChat official-account article and store it as an Article record.
 *
 * Fetches the page at $url, extracts the title, the js_content body, the
 * account nickname and avatar, mirrors inline images and CSS background
 * images under ./uploads/collect/<Y-m>/<d>/, rewrites video iframes and
 * saves the result through ArticleModel.
 *
 * @param  string  $url Article URL to fetch.
 * @param  integer $uid Id of the user the article is collected for.
 * @return mixed   The record id, or false when the page cannot be read,
 *                 contains no js_content block, or the save fails.
 */
public static function collect($url, $uid = 1)
{
    $map = [
        'url' => $url,
        'uid' => $uid,
    ];
    $info = ArticleModel::where($map)->find();
    if ($info) {
        // NOTE(review): an already-collected article is deleted and the id of
        // the deleted record is returned without collecting it again. Kept
        // unchanged; confirm that this is the intended contract.
        $info->delete();
        return $info->id;
    }

    // NOTE(security): $url is fetched verbatim. If it can come from an
    // untrusted caller, validate scheme/host before this point (SSRF, local
    // file reads).
    $html = file_get_contents($url);
    if ($html === false || $html === '') {
        return false;
    }

    // Title: text after id="activity-name"> up to a closing </h2>.
    $title = '';
    if (preg_match("/id=\"activity-name\">(.*)<\/h2>/is", $html, $hit)) {
        $title = str_replace("\r\n", "", str_replace(" ", "", $hit[1]));
    }
    $title = trim(self::get_title($title));

    // Body: the js_content container up to the next "<script".
    if (!preg_match("/id=\"js_content\">(.*)<script/iUs", $html, $hit)) {
        return false;
    }
    $content = "<div id='js_content' class='ce-padding-sm'>" . $hit[1];

    // Mirror every <img> locally; the first stored image becomes the thumb.
    $dir = './uploads/collect/' . date('Y-m/d/');
    $thumb = '';
    preg_match_all('/<img(?=[\s\/>])[^>]*>/i', $content, $img);
    foreach ($img[0] as $value) {
        // Lazy-loaded attributes take precedence over a plain src.
        $source = '';
        foreach (['data-src', 'data-croporisrc', 'src'] as $attr) {
            if (preg_match('/' . $attr . '="(.*?)"/is', $value, $hit) && $hit[1] !== '') {
                $source = $hit[1];
                break;
            }
        }
        if ($source === '') {
            continue;
        }
        // Attribute values are HTML-encoded (e.g. &amp; between parameters).
        $source = html_entity_decode($source);
        $realfile = $dir . self::collectLocalName($source);
        if (!self::collectSaveRemote($source, $realfile)) {
            // Best effort: leave the original tag untouched on failure.
            continue;
        }
        $styleAttr = 'style="width: auto !important; height: auto !important; visibility: visible !important;"';
        if (preg_match('/style=".*?"/is', $value, $hit) && $hit[0] != 'style=""') {
            $styleAttr = $hit[0];
        }
        // Point the tag at the local copy.
        $local = ltrim($realfile, '.');
        $content = str_replace($value, '<img src="' . $local . '" ' . $styleAttr . '>', $content);
        if ($thumb === '') {
            $thumb = $local;
        }
    }

    // Mirror CSS background images.
    $content = str_replace("background-image: url", "background-image:url", $content);
    preg_match_all('|background-image:url\((.*)\);|U', $content, $backimage);
    foreach ($backimage[1] as $value) {
        $str = trim(str_replace("&quot;", "", $value), " '\"");
        if ($str === '') {
            continue;
        }
        $realfile = $dir . self::collectLocalName($str);
        if (!self::collectSaveRemote(html_entity_decode($str), $realfile)) {
            continue;
        }
        $content = str_replace($str, ltrim($realfile, '.'), $content);
    }

    // Switch video preview pages to the player page.
    $content = str_replace("preview.html", "player.html", $content);
    // Rebuild lazy-loaded iframes with a real src attribute.
    preg_match_all('/<iframe[^>]*\s+data-src="([^"]*)"[^>]*>/is', $content, $matched);
    foreach ($matched[0] as $key => $value) {
        $src = explode('&', $matched[1][$key]);
        $str = '<p style="max-width:100%; margin:14px"><iframe frameborder="0" src="' . $src[0] . '&auto=0" style="z-index: 1; width: 100% ! important; height: 231.75px ! important; overflow: hidden;" class="video_iframe" scrolling="no"></iframe></p>';
        $content = str_replace($value, $str, $content);
    }

    $description = self::get_description($content);
    // Account name and avatar are exposed as JS variables in the page.
    $nickname = preg_match('/var nickname = \"(.*?)\";/si', $html, $hit) ? $hit[1] : '';
    $head_img = preg_match('/var round_head_img = \"(.*?)\";/si', $html, $hit) ? $hit[1] : '';

    $data = [
        'title'       => $title,
        'content'     => $content,
        'description' => $description,
        'category_id' => 0,
        'storage_id'  => 0,
        'thumb'       => $thumb,
        'status'      => 1,
        'create_time' => time(),
        'update_time' => 0,
        'url'         => $url,
        'nickname'    => $nickname,
        'head_img'    => $head_img,
        'uid'         => $uid,
        'click'       => 0,
    ];
    $info = new ArticleModel;
    // Test the save result itself; the model object is always truthy.
    if ($info->save($data) === false) {
        return false;
    }
    return $info->id;
}

/**
 * Build the local file name for a mirrored asset URL.
 *
 * Uses the fifth "/"-separated segment of the URL as the base name (falling
 * back to an md5 of the URL) and the wx_fmt query value as the extension,
 * limited to known image types so remote content cannot pick e.g. ".php".
 *
 * @param  string $url Asset URL.
 * @return string File name without directory.
 */
private static function collectLocalName($url)
{
    $parts = explode('/', $url);
    $base = isset($parts[4]) ? preg_replace('/[^A-Za-z0-9_\-]/', '', $parts[4]) : '';
    if ($base === '' || $base === null) {
        $base = md5($url);
    }
    $ext = 'png';
    if (preg_match('/wx_fmt=([A-Za-z0-9]+)/', $url, $hit)
        && in_array(strtolower($hit[1]), ['jpeg', 'jpg', 'png', 'gif', 'webp', 'bmp', 'svg'], true)
    ) {
        $ext = $hit[1];
    }
    return $base . '.' . $ext;
}

/**
 * Download an http(s) resource and write it to $path.
 *
 * @param  string $url  Absolute http:// or https:// URL.
 * @param  string $path Destination file path; its directory is created if missing.
 * @return bool   True when the file was written, false on any failure.
 */
private static function collectSaveRemote($url, $path)
{
    // URLs come from remote HTML: refuse local paths and other stream wrappers.
    if (!preg_match('#^https?://#i', $url)) {
        return false;
    }
    try {
        $dir = dirname($path);
        if (!is_dir($dir) && !mkdir($dir, 0755, true) && !is_dir($dir)) {
            return false;
        }
        $data = file_get_contents($url);
        if ($data === false || $data === '') {
            return false;
        }
        return file_put_contents($path, $data) !== false;
    } catch (\Exception $e) {
        // Fully qualified on purpose: an unqualified "Exception" would resolve
        // inside the current namespace and never match. Covers runtimes that
        // turn I/O warnings into exceptions.
        return false;
    }
}
/**
 * Extract the article title from the raw text captured after the
 * activity-name heading.
 *
 * When the text contains a "document.write" call, the title is taken as the
 * eighth piece of the text split on double quotes. Otherwise, or when that
 * piece is missing or empty, the text before the first "</h2>" is returned.
 *
 * @param  string $chars Raw captured text.
 * @return string Extracted title.
 */
public static function get_title($chars)
{
    $chars = (string) $chars;
    $title = '';
    // Compare against false: a match at offset 0 is still a match.
    if (stripos($chars, "document.write") !== false) {
        $pieces = explode('"', $chars);
        // Guard the index so short input falls through to the fallback
        // instead of raising an undefined-offset error.
        if (isset($pieces[7])) {
            $title = $pieces[7];
        }
    }
    if ($title === '') {
        $pieces = explode('</h2>', $chars);
        $title = $pieces[0];
    }
    return $title;
}
/**
 * Build a short plain-text description from article HTML.
 *
 * The HTML is passed through clearHtml() and strip_tags(), single quotes are
 * dropped, whitespace is removed with trimall(), and the result is cut with
 * Str::msubstr(..., 0, 140).
 *
 * @param  string $content Article HTML.
 * @return string Description text.
 */
public static function get_description($content)
{
    $plain = strip_tags(self::clearHtml($content));
    $plain = str_replace("'", '', $plain);
    return Str::msubstr(self::trimall($plain), 0, 140);
}
/**
 * Remove the given HTML elements from a string.
 *
 * For each tag name, first removes "<tag ...>...</tag>" including the
 * enclosed content, then removes any remaining "<tag ...>" opening tags.
 * Matching is case-insensitive and tags may span several lines.
 *
 * @param  array|string $tags Tag name(s) to remove, e.g. ['script', 'img'].
 * @param  string       $str  Input HTML.
 * @return string       HTML without the listed elements.
 */
public static function strip_html_tags($tags, $str)
{
    $patterns = [];
    foreach ((array) $tags as $tag) {
        // (?=[\s\/>]) ends the tag name, so "b" no longer matches <br> or
        // <body>; [^>]* lets attributes run across line breaks.
        $open = '<' . $tag . '(?=[\s\/>])[^>]*>';
        $patterns[] = '/' . $open . '[\s\S]*?<\/' . $tag . '\s*>/i';
        $patterns[] = '/' . $open . '/i';
    }
    return preg_replace($patterns, '', $str);
}
/**
 * Clean an HTML fragment with a fixed, ordered series of replacements.
 *
 * Removes CR, LF, tab and "&nbsp;", collapses whitespace runs to one space,
 * strips comments, declarations and a fixed set of tags, and rewrites
 * "cookie", "javascript", "vbscript", "on...=" and "&#" sequences. The order
 * of the rules matters and is the same as the rule list below.
 *
 * @param  string $descclear HTML to clean.
 * @return string Cleaned text.
 */
public static function clearHtml($descclear)
{
    // Literal removals, applied one after another in this order.
    $text = str_replace(array("\r", "\n", "\t", "\r\n", "&nbsp;"), "", $descclear);

    // Ordered [pattern, replacement] pairs.
    $rules = array(
        array("/\s+/", " "),                                            // collapse whitespace runs
        array("/<[ ]+/si", "<"),                                        // "<" followed by spaces
        array("/<\!--.*?-->/si", ""),                                   // HTML comments
        array("/<(\!.*?)>/si", ""),                                     // DOCTYPE and other "<!" tags
        array("/<(\/?html.*?)>/si", ""),                                // html
        array("/<(\/?head.*?)>/si", ""),                                // head
        array("/<(\/?meta.*?)>/si", ""),                                // meta
        array("/<(\/?body.*?)>/si", ""),                                // body
        array("/<(\/?link.*?)>/si", ""),                                // link
        array("/<(\/?form.*?)>/si", ""),                                // form
        array("/cookie/si", "COOKIE"),                                  // upper-case "cookie"
        array("/<(applet.*?)>(.*?)<(\/applet.*?)>/si", ""),             // applet with content
        array("/<(\/?applet.*?)>/si", ""),                              // stray applet tags
        array("/<(style.*?)>(.*?)<(\/style.*?)>/si", ""),               // style with content
        array("/<(\/?style.*?)>/si", ""),                               // stray style tags
        array("/<(title.*?)>(.*?)<(\/title.*?)>/si", ""),               // title with content
        array("/<(\/?title.*?)>/si", ""),                               // stray title tags
        array("/<(object.*?)>(.*?)<(\/object.*?)>/si", ""),             // object with content
        array("/<(\/?objec.*?)>/si", ""),                               // stray object tags
        array("/<(noframes.*?)>(.*?)<(\/noframes.*?)>/si", ""),         // noframes with content
        array("/<(\/?noframes.*?)>/si", ""),                            // stray noframes tags
        array("/<(i?frame.*?)>(.*?)<(\/i?frame.*?)>/si", ""),           // frame/iframe with content
        array("/<(\/?i?frame.*?)>/si", ""),                             // stray frame/iframe tags
        array("/<(script.*?)>(.*?)<(\/script.*?)>/si", ""),             // script with content
        array("/<(\/?script.*?)>/si", ""),                              // stray script tags
        array("/javascript/si", "Javascript"),                          // rewrite "javascript"
        array("/vbscript/si", "Vbscript"),                              // rewrite "vbscript"
        array("/on([a-z]+)\s*=/si", "On\\1="),                          // rewrite on...= handlers
        array("/&#/si", "&"),                                           // break numeric entities
        // Final sweep over a fixed list of remaining tags.
        array("/<(\/?)(script|i?frame|style|html|body|li|i|map|title|img|link|span|u|font|table|tr|b|marquee|td|strong|div|a|meta|\?|\%)([^>]*?)>/isU", ""),
    );
    foreach ($rules as $rule) {
        $text = preg_replace($rule[0], $rule[1], $text);
    }
    return $text;
}
/**
 * Remove whitespace characters from a string.
 *
 * Every occurrence of each character listed in $qian (space, tab, line feed,
 * carriage return) is replaced with an empty string.
 *
 * @param  string $str Input text.
 * @return string Text with the listed characters removed.
 */
public static function trimall($str)
{
// Characters to strip.
// NOTE(review): the first two entries look identical here; the hosting page
// warns about invisible Unicode characters in this file, so one of them may be
// a non-ASCII space (e.g. a full-width space) — confirm against the raw bytes.
$qian = array(" ", " ", "\t", "\n", "\r");
// One empty replacement per entry in $qian.
$hou = array("", "", "", "", "");
return str_replace($qian, $hou, $str);
}
}