源码网商城,靠谱的源码在线交易网站 我的订单 购物车 帮助

源码网商城

php获取网页标题和内容函数(不包含html标签)

  • 时间:2021-04-12 00:48 编辑: 来源: 阅读:
  • 扫一扫,手机访问
摘要:php获取网页标题和内容函数(不包含html标签)
代码如下:
/**
 * Download a web page with cURL and extract its title, meta data and
 * tag-free text.
 *
 * The returned array always contains the same keys:
 *  - content_type: Content-Type reported by cURL ('' when unknown)
 *  - charset:      charset from the Content-Type header, else from a <meta> tag
 *  - title:        trimmed contents of the <title> element
 *  - description:  content of <meta name="description">, double quotes removed
 *  - keywords:     content of <meta name="keywords">, double quotes removed
 *  - body:         <body> text with markup stripped, passed through addslashes()
 *  - httpcode:     HTTP status code; 505 is used when the cURL transfer itself fails
 *  - all:          whole document with markup stripped, passed through addslashes()
 *
 * Only responses with status 200 and a text/html content type are parsed;
 * for anything else the text fields are left empty.
 *
 * @param string $url Address of the page to download, e.g. 'http://www.ttphp.com'.
 *
 * @return array Page information as described above.
 */
function getPageContent($url)
{
    $pageinfo = array(
        'content_type' => '',
        'charset'      => '',
        'title'        => '',
        'description'  => '',
        'keywords'     => '',
        'body'         => '',
        'httpcode'     => 200,
        'all'          => '',
    );

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)');
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    // NOTE(review): TLS host/peer verification is switched off, so HTTPS
    // responses are not authenticated. Kept as in the original; confirm
    // that this is acceptable for the intended use.
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 0);
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, 0);
    curl_setopt($ch, CURLOPT_TIMEOUT, 8);
    curl_setopt($ch, CURLOPT_FILETIME, 1);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt($ch, CURLOPT_URL, $url);

    // Read everything we need from the handle up front so that it can be
    // closed exactly once, whichever return path is taken below.
    $store       = curl_exec($ch);
    $curlError   = curl_error($ch);
    $httpCode    = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $contentType = (string) curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
    curl_close($ch);

    if ($store === false || $curlError !== '') {
        // 505 is this function's marker for a failed transfer.
        $pageinfo['httpcode'] = 505;
        echo 'Curl error: ' . $curlError . "\n";
        return $pageinfo;
    }

    $pageinfo['httpcode']     = $httpCode;
    $pageinfo['content_type'] = $contentType;
    if ((int) $httpCode !== 200 || stripos($contentType, 'text/html') === false) {
        return $pageinfo;
    }

    // Prefer the charset announced in the HTTP header.
    if (preg_match('/charset=([^\s;]+)/i', $contentType, $matches)) {
        $pageinfo['charset'] = trim($matches[1], " \t\"'");
    }

    // Drop markup that would pollute the extracted text. A null result
    // means PCRE failed (e.g. backtrack limit); keep the previous text then.
    $stripPatterns = array(
        '/<script\b[^>]*>.*?<\/script>/si', // inline and external scripts
        '/<link\s+[^>]+>/i',                // <link> elements
        '/<!--.*?-->/s',                    // HTML comments
        '/<style\b[^>]*>.*?<\/style>/si',   // style sheets
    );
    foreach ($stripPatterns as $pattern) {
        $cleaned = preg_replace($pattern, '', $store);
        if ($cleaned !== null) {
            $store = $cleaned;
        }
    }

    // Fall back to the charset declared in a <meta> tag.
    if ($pageinfo['charset'] === ''
        && preg_match('@<meta[^>]+charset\s*=\s*["\']?([\w\-]+)@i', $store, $matches)
    ) {
        $pageinfo['charset'] = trim($matches[1]);
    }

    // Remove full-width (ideographic) spaces. The byte sequence below is the
    // UTF-8 encoding of U+3000, so it is only applied to UTF-8 documents to
    // avoid corrupting multi-byte characters of other encodings.
    if (strcasecmp($pageinfo['charset'], 'utf-8') === 0) {
        $store = str_replace("\xE3\x80\x80", '', $store);
    }

    // description / keywords: accept double-quoted, single-quoted or bare
    // attribute values; the branch-reset group keeps the value in group 1.
    foreach (array('description', 'keywords') as $metaName) {
        $pattern = '@<meta\s+name\s*=\s*["\']?' . $metaName
            . '["\']?\s+content\s*=\s*(?|"([^"]*)"|\'([^\']*)\'|([^\s>]+))@i';
        if (preg_match($pattern, $store, $matches)) {
            $pageinfo[$metaName] = str_replace('"', '', trim($matches[1]));
        }
    }

    if (preg_match('/<title\b[^>]*>(.*?)<\/title>/si', $store, $matches)) {
        $pageinfo['title'] = trim($matches[1]);
    }

    $bodyHtml = '';
    if (preg_match('/<body\b[^>]*>(.*?)<\/body>/si', $store, $matches)) {
        $bodyHtml = $matches[1];
    }

    // addslashes() is part of the established return format of this function.
    $pageinfo['body'] = addslashes(replaceHtmlAndJs($bodyHtml));
    $pageinfo['all']  = addslashes(replaceHtmlAndJs($store));

    return $pageinfo;
}

/**
 * Strip all HTML and JavaScript markup from a document fragment.
 *
 * Removes, in this order: <script> elements including their content, any
 * remaining tags, every run of whitespace, and named HTML entities such
 * as &nbsp;.
 *
 * @param string $document HTML fragment to clean.
 *
 * @return string The remaining text; an empty string if PCRE fails.
 */
function replaceHtmlAndJs($document)
{
    $document = trim($document);
    if (strlen($document) <= 0) {
        return $document;
    }

    $search = array(
        '@<script\b[^>]*>.*?</script>@si', // JavaScript blocks
        '@<[^<>]*>@',                      // HTML tags
        '@\s+@',                           // whitespace (literal "+" is kept)
        '@&\w+;@',                         // named HTML entities
    );

    $result = preg_replace($search, '', $document);

    // preg_replace() yields null on a PCRE error; keep the return type a string.
    return $result === null ? '' : $result;
}
使用例子
代码如下:
// The URL must be passed as a quoted string; a bareword would be parsed
// as a concatenation of undefined constants.
$a = getPageContent('http://www.ttphp.com');
print_r($a);
  • 全部评论(0)
联系客服
客服电话:
400-000-3129
微信版

扫一扫进微信版
返回顶部