知识大全 php正则匹配获取指定url网页页面超级链接地址

Posted 2022-07-21 内容

篇首语：怀抱观古今，寝食展戏谑。本文由小常识网(cha138.com)小编为大家整理，主要介绍了知识大全 php正则匹配获取指定url网页页面超级链接地址相关的知识，希望对你有一定的参考价值。

　　在数据采集与页面分析中，常常需要抓取给定 URL 页面的内容，或者抓取第二、第三层深度页面的内容。

　　这里是一个测试例子的实现，仅供参考。

　　代码如下：

/**
 * Collect the href targets of every <a> element found in an HTML document.
 *
 * Hrefs that already start with http:// or https:// are returned unchanged;
 * every other href is treated as relative and prefixed with $host, joined by
 * exactly one slash. Quoted hrefs are listed before unquoted ones, each group
 * in document order. Duplicates are NOT removed here.
 *
 * NOTE(review): this listing was recovered from a page whose punctuation and
 * digits were stripped; the pattern and group indices are reconstructed.
 *
 * @param string $host     Scheme and host used to absolutize relative links,
 *                         e.g. "http://example.com".
 * @param string $document HTML source to scan.
 * @return array Flat list of link URLs; empty array when no link is found.
 */
function match_links($host, $document)
{
    $found = array();

    // Capture groups: 1 = optional opening quote, 2 = quoted href value,
    // 3 = unquoted href value, 4 = anchor text. The conditional (?(1)...)
    // picks the quoted branch only when an opening quote was seen.
    $pattern = '~<\s*a\s.*?href\s*=\s*(["\'])?(?(1)(.*?)\1|([^\s>]+))[^>]*>?(.*?)</a>~isx';

    if (!preg_match_all($pattern, (string) $document, $links)) {
        // No match (0) or a PCRE error (false): there is nothing to return.
        return $found;
    }

    $base = rtrim((string) $host, '/');

    // foreach replaces the each() loops, which no longer exist in PHP 8.
    foreach (array(2, 3) as $group) {
        foreach ($links[$group] as $href) {
            if ($href === '') {
                // Unmatched alternatives are reported as empty strings.
                continue;
            }
            if (preg_match('~^https?://~i', $href)) {
                $found[] = $href;
            } else {
                $found[] = $base . '/' . ltrim($href, '/');
            }
        }
    }

    return $found;
}

/**
 * Fetch a URL and return only the Chinese (CJK Unified Ideograph) text on it.
 *
 * The page is fetched with file_get_contents(), converted from GBK to UTF-8
 * when it is not already valid UTF-8, stripped of markup with strip_tags(),
 * and finally reduced to the characters in U+4E00..U+9FFF, concatenated
 * without separators.
 *
 * @param string $url Address (or local path) to read.
 * @return string|null The concatenated Han characters, or NULL when the page
 *                     cannot be read or contains no such characters.
 */
function get_content_from_url($url)
{
    // Errors are suppressed on purpose: an unreachable page simply yields NULL.
    $str = @file_get_contents($url);
    if ($str === false || $str === '') {
        return NULL;
    }

    // Many valid UTF-8 byte sequences also validate as GBK, so testing GBK
    // alone would re-encode (and corrupt) pages that are already UTF-8.
    if (!mb_check_encoding($str, 'UTF-8') && mb_check_encoding($str, 'GBK')) {
        $converted = @iconv('GBK', 'UTF-8//IGNORE', $str);
        if ($converted !== false) {
            $str = $converted;
        }
    }

    // Remove markup.
    $str = strip_tags($str);

    // Keep only Han characters. preg_match_all() returns 0 when nothing
    // matches and false when the subject is not valid UTF-8 under /u.
    if (!preg_match_all('/[\x{4e00}-\x{9fff}]+/u', $str, $matches)) {
        return NULL;
    }

    $str = join('', $matches[0]);
    if ($str === '') {
        return NULL;
    }
    return $str;
}

/**
 * Fetch a page and, recursively, the pages it links to, down to $depth levels.
 *
 * A depth of 1 fetches only $url itself; each extra level follows the links
 * returned by match_links() on the pages of the previous level. Every URL is
 * fetched at most once per crawl.
 *
 * NOTE(review): the recovered listing discarded the result of the recursive
 * call and had no visible return value on success; returning the fetched
 * pages is a reconstruction — confirm against the intended use.
 *
 * @param string $url   Address of the starting page.
 * @param int    $depth Number of levels to fetch; must be at least 1.
 * @param array  $seen  Internal bookkeeping of URLs already requested
 *                      (url => true); callers normally omit it.
 * @return array|false  Map of url => raw page source for every page fetched,
 *                      or false when the arguments are invalid or $url itself
 *                      cannot be fetched.
 */
function get_content($url, $depth, array &$seen = array())
{
    if (!$url || $depth < 1) {
        return false;
    }

    $seen[$url] = true;

    // Errors are suppressed on purpose: an unreachable page yields false.
    $source = @file_get_contents($url);
    if (!$source) {
        return false;
    }

    $pages = array($url => $source);
    if ($depth <= 1) {
        return $pages;
    }

    // Relative links can only be resolved when the URL carries scheme + host.
    $parts = parse_url($url);
    if (empty($parts['scheme']) || empty($parts['host'])) {
        return $pages;
    }
    $host = $parts['scheme'] . '://' . $parts['host'];
    if (!empty($parts['port'])) {
        $host .= ':' . $parts['port'];
    }

    foreach (array_unique(match_links($host, $source)) as $link) {
        if (isset($seen[$link])) {
            continue;
        }
        // Recursive call: one level less for every hop away from the start.
        $child = get_content($link, $depth - 1, $seen);
        if ($child !== false) {
            // Array union keeps the first copy of any URL already collected.
            $pages += $child;
        }
    }

    return $pages;
}

cha138/Article/program/PHP/201311/21224