悠悠楠杉
网站页面
获取页面中指定的内容,主要用到 preg_match_all() 函数;如果只需要匹配第一处结果,也可以改用 preg_match() 函数。
<!DOCTYPE html>
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<meta http-equiv="Cache-Control" content="no-cache">
<title>测试</title>
</head>
<body>
<!-- Sample markup for the extraction demo: presumably this page is the 1.html
     file read by the PHP snippet, whose regex captures the text of each
     div with class "box" below. -->
<div class="box">测试</div>
<div class="box">测试2</div>
<div class="box">测试3</div>
<div class="box">测试4</div>
<div class="box">测试5</div>
</body>
</html>
// Read the sample page into a string.
$str = file_get_contents('1.html');
// Capture the inner text of every box div; the /U modifier makes (.*) lazy,
// so each div is matched on its own instead of spanning to the last </div>.
preg_match_all ('/<div class="box">(.*)<\/div>/U', $str, $pat_array);
// print_r($pat_array[1][0]);   // would print only the first captured text
// $pat_array[1] holds the first capture group of every match, in order.
foreach ($pat_array[1] as $text) {
    echo $text . '<br>';
}
// Output:
// 测试
// 测试2
// 测试3
// 测试4
// 测试5
正则表达式前后的定界符 / 也可以换成 #,如 $regex1 = '#<strong>(.*?)</strong>#';
用了 # 作定界符之后,表达式里 </strong> 中的 / 就不需要再转义了!
// Sample markup: one link containing two image/label pairs.
$s='<a class="js-tongjic" href="afhdfhdfh.html" monitor-shortpv-c="215874">
<img src="/image/1.png"/>
<strong>123456</strong>
<img src="/image/2.png"/>
<strong>654321</strong></a>';
// Five capture groups: href, first src, first <strong> text, second src,
// second <strong> text. [\s\S]*? lazily skips anything (newlines included)
// between the pieces.
$regex = '/class="js-tongjic" href="(.*?)"[\s\S]*?src="(.*?)"[\s\S]*?<strong>(.*?)<\/strong>[\s\S]*?src="(.*?)"[\s\S]*?<strong>(.*?)<\/strong>/';
preg_match_all($regex, $s, $matche);
// Dump the result inside a closed <pre> block. The full match in $matche[0]
// contains <img>/<strong> markup, so escape the dump; otherwise a browser
// would render those tags instead of displaying them.
echo '<pre>';
echo htmlspecialchars(print_r($matche, true));
echo '</pre>';
curl 把 HTTP 协议的细节封装成了一组函数,只需传入相应的参数即可,不必手工拼接 HTTP 协议字符串,大大降低了编写难度。
前提:在php.ini中要开启curl扩展。
/**
 * Fetch a URL with cURL and return the response.
 *
 * Sends a POST when $post is non-empty, otherwise a plain request to $url.
 * Redirects are followed and the response is returned instead of printed.
 *
 * @param string       $url        Target URL.
 * @param string|array $post       POST payload. Arrays are URL-encoded with
 *                                 http_build_query(); strings are sent as-is.
 *                                 A string that decodes to a JSON object or
 *                                 array also gets a JSON Content-Type header.
 * @param string       $cookie     Value of the Cookie request header.
 * @param string       $referer    Value used for Referer and Origin; $url is
 *                                 used when this is empty.
 * @param string       $proxy      "host:port" to route through a proxy, or a
 *                                 bare IP that is only advertised through the
 *                                 X-FORWARDED-FOR / CLIENT-IP headers.
 * @param integer      $header     1 to return the response headers as well,
 *                                 0 to return only the body.
 * @param string       $userAgent  User-Agent header; a desktop Chrome string
 *                                 is used when this is empty.
 * @param array        $httpheader Extra raw request header lines.
 * @param int          $timeout    Maximum duration of the transfer, in seconds.
 * @return string|array|false The response body; or, when $header is truthy,
 *                            an array with 'header' and 'body' keys (both
 *                            empty if the transfer failed); or false when
 *                            curl_exec() fails and $header is falsy.
 */
function get_curl($url, $post = '', $cookie = '', $referer = '', $proxy = '', $header = 0, $userAgent = '', $httpheader = [], $timeout = 10)
{
    $curl = curl_init();
    // See the curl_setopt() page of the PHP manual for every available option.
    curl_setopt($curl, CURLOPT_URL, $url);

    if ($post) {
        // Switch to POST and attach the payload.
        curl_setopt($curl, CURLOPT_POST, 1);
        if (is_array($post)) {
            curl_setopt($curl, CURLOPT_POSTFIELDS, http_build_query($post));
        } else {
            curl_setopt($curl, CURLOPT_POSTFIELDS, $post);
        }
        // Test the decoded type rather than its truthiness so that empty JSON
        // documents ("{}" and "[]") are also recognised as JSON bodies.
        if (is_string($post) && is_array(json_decode($post, true))) {
            $httpheader[] = 'Content-Type: application/json; charset=utf-8';
            $httpheader[] = 'Content-Length: ' . strlen($post);
        }
    }

    // Simulated referrer: fall back to the requested URL itself.
    $source = $referer ? $referer : $url;
    $httpheader[] = 'Referer: ' . $source;
    $httpheader[] = 'Origin: ' . $source;

    if ($cookie) {
        $httpheader[] = 'Cookie: ' . $cookie; // simulated cookie
    }

    if ($proxy) {
        $proxy = explode(':', $proxy);
        if (!empty($proxy[1])) {
            curl_setopt($curl, CURLOPT_PROXY, $proxy[0]);     // proxy server address
            curl_setopt($curl, CURLOPT_PROXYPORT, $proxy[1]); // proxy server port
        }
        $httpheader[] = 'X-FORWARDED-FOR: ' . $proxy[0]; // simulated client IP
        $httpheader[] = 'CLIENT-IP: ' . $proxy[0];       // simulated client IP
    } elseif (!empty($_SERVER['REMOTE_ADDR'])) {
        // $_SERVER is non-empty under CLI too, but REMOTE_ADDR is not set
        // there, so check the key itself to avoid a warning and empty headers.
        $httpheader[] = 'X-FORWARDED-FOR: ' . $_SERVER['REMOTE_ADDR']; // simulated client IP
        $httpheader[] = 'CLIENT-IP: ' . $_SERVER['REMOTE_ADDR'];       // simulated client IP
    }

    if ($header) {
        curl_setopt($curl, CURLOPT_HEADER, TRUE); // include response headers in the output
    }

    if ($userAgent) {
        $httpheader[] = 'User-Agent: ' . $userAgent; // simulated browser identity
    } else {
        $httpheader[] = 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36';
    }

    // Send an explicit Host header built from the URL (port included if given).
    $parseUrl = parse_url($url);
    if (!empty($parseUrl['host'])) {
        $host = 'Host: ' . $parseUrl['host'];
        if (!empty($parseUrl['port'])) {
            $host .= ':' . $parseUrl['port'];
        }
        $httpheader[] = $host;
    }

    curl_setopt($curl, CURLOPT_HTTPHEADER, $httpheader); // custom request headers
    curl_setopt($curl, CURLOPT_TIMEOUT, $timeout);       // timeout in seconds
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);       // return the response instead of printing it
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, 1);       // follow redirects
    // NOTE(review): TLS certificate and host verification are switched off, so
    // HTTPS responses are not authenticated. Kept for compatibility with
    // existing callers; enable verification before sending anything sensitive.
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, false);
    // Empty string: accept every encoding cURL supports and decode the
    // response automatically (avoids garbled compressed pages).
    curl_setopt($curl, CURLOPT_ENCODING, '');

    // Execute the request.
    $ret = curl_exec($curl);
    if ($header) {
        // Split the raw response into header and body parts. A failed transfer
        // (false) is treated as an empty response so the array shape is stable.
        $raw = ($ret === false) ? '' : $ret;
        $headerSize = curl_getinfo($curl, CURLINFO_HEADER_SIZE);
        $ret = [
            'header' => substr($raw, 0, $headerSize),
            'body'   => substr($raw, $headerSize),
        ];
    }
    curl_close($curl);
    return $ret;
}
// GET request
$data = get_curl('https://www.baidu.com/');
// POST request with form fields; array keys must be quoted strings.
// Equivalent string form: get_curl('https://www.baidu.com/', 'id=1&url=www.zzwws.cn');
$data = get_curl('https://www.baidu.com/', ['id' => 1, 'url' => 'www.zzwws.cn']);
// POST request with a JSON body. The payload must be valid JSON (double-quoted
// keys and strings): get_curl() only adds the JSON Content-Type header when
// json_decode() accepts the string.
$data = get_curl('https://www.zzwws.cn/', '{"id": 1,"url": "https://www.zzwws.cn/"}');
/**
 * Follow the redirects of a URL and report where they end up.
 *
 * The request is made without fetching the response body.
 *
 * @param string $url URL to resolve.
 * @return mixed The value of CURLINFO_EFFECTIVE_URL after the request.
 */
function redirect_url($url) {
    $handle = curl_init();
    curl_setopt_array($handle, [
        CURLOPT_URL            => $url,
        // The page content is not needed.
        CURLOPT_NOBODY         => 1,
        CURLOPT_USERAGENT      => "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
        // Do not print the response directly.
        CURLOPT_RETURNTRANSFER => 1,
        // Follow Location headers so the last URL is the final destination.
        CURLOPT_FOLLOWLOCATION => 1,
    ]);
    curl_exec($handle);
    $effectiveUrl = curl_getinfo($handle, CURLINFO_EFFECTIVE_URL);
    curl_close($handle);
    return $effectiveUrl;
}