Удобный парсер на ReactPHP + multicurl с возможностью загрузки списка прокси и его проверки
Установка
composer require grab/spider "dev-master"
Пример парсера
<?php
require __DIR__ . '/../vendor/autoload.php';
/**
 * Example crawler built on \Grab\Spider.
 *
 * Queues the first four Hacker News listing pages, queues a "topic" task for
 * every element matching `.storylink` on those pages, and prints the <title>
 * text of each fetched topic document.
 */
class HackerNewCrawler extends \Grab\Spider
{
    /**
     * Queues one "page" task for each listing page, p=1 through p=4.
     *
     * NOTE(review): 'max_request' is passed through to the spider as-is;
     * presumably it caps the attempts per task - confirm against Grab\Spider.
     */
    public function taskGenerator()
    {
        foreach (range(1, 4) as $pageNumber) {
            $this->task('page', [
                'url' => sprintf('https://news.ycombinator.com/news?p=%d', $pageNumber),
                'max_request' => 10,
            ]);
        }
    }

    /**
     * Queues a "topic" task for every `.storylink` element found on a listing page.
     *
     * @param mixed $parser Parsed document; must provide find($selector)
     * @param mixed $task   Task descriptor supplied by the spider (unused here)
     */
    public function taskPage($parser, $task)
    {
        foreach ($parser->find('.storylink') as $storyLink) {
            $this->task('topic', [
                'url' => $storyLink->getAttribute('href'),
                'curl_config' => [
                    // Per-task cURL override: allow up to 60 seconds for the transfer.
                    CURLOPT_TIMEOUT => 60,
                ],
                'max_request' => 10,
            ]);
        }
    }

    /**
     * Prints the trimmed text of the first <title> element of a topic document.
     *
     * Documents without any <title> match are skipped instead of triggering a
     * fatal "member function on null" error that would abort the crawl.
     *
     * @param mixed $parser Parsed document; must provide find($selector)
     * @param mixed $task   Task descriptor supplied by the spider (unused here)
     */
    public function taskTopic($parser, $task)
    {
        $titles = $parser->find('title');

        // A fetched document is not guaranteed to contain a <title> element.
        if (!isset($titles[0])) {
            return;
        }

        echo trim($titles[0]->text()) . PHP_EOL;
    }
}
// cURL options handed to the crawler: identify as desktop Chrome on macOS.
$curlSettings = [
    CURLOPT_USERAGENT => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
];

// Create the crawler with its debug flag switched on.
$bot = new HackerNewCrawler();
$bot->debug = true;
$bot->setCurlSetting($curlSettings);

// Uncomment to load a proxy list from proxy_list.txt located next to this script.
//$bot->loadProxy(__DIR__ . '/proxy_list.txt');

// Start crawling.
$bot->run();
<?php
/**
 * Performs an HTTP request against a REST endpoint using cURL.
 *
 * The method name is matched case-insensitively. Only POST, PUT and DELETE
 * receive special handling; any other value leaves cURL's default request
 * method in place.
 *
 * @param string $method HTTP method (GET|POST|PUT|DELETE)
 * @param string $url    Request URL; may already contain a query string
 * @param array  $data   Request body (used for POST and PUT only)
 * @param array  $gets   Additional GET parameters appended to the URL
 *
 * @return mixed Response body as a string, or false if the request failed
 */
function callRestApi($method, $url, $data = [], $gets = [])
{
    $curl = curl_init();
    if ($curl === false) {
        // Report a failed handle the same way a failed transfer is reported.
        return false;
    }

    switch (strtolower($method)) {
        case 'post':
            curl_setopt($curl, CURLOPT_POST, 1);
            if ($data) {
                // The array is handed to cURL unchanged, so it is sent as
                // multipart/form-data (unlike PUT below, which is URL-encoded).
                curl_setopt($curl, CURLOPT_POSTFIELDS, $data);
            }
            break;
        case 'put':
            curl_setopt($curl, CURLOPT_CUSTOMREQUEST, 'PUT');
            curl_setopt($curl, CURLOPT_HTTPHEADER, ['X-HTTP-Method-Override: PUT']);
            if ($data) {
                curl_setopt($curl, CURLOPT_POSTFIELDS, http_build_query($data));
            }
            break;
        case 'delete':
            curl_setopt($curl, CURLOPT_CUSTOMREQUEST, 'DELETE');
            curl_setopt($curl, CURLOPT_HTTPHEADER, ['X-HTTP-Method-Override: DELETE']);
            break;
    }

    if ($gets) {
        // Pick the separator from the URL itself so that a URL which already
        // carries a query string is extended with "&" instead of a second "?".
        if (strpos($url, '?') === false) {
            $separator = '?';
        } else {
            $lastChar = substr($url, -1);
            $separator = ($lastChar === '?' || $lastChar === '&') ? '' : '&';
        }
        $url = $url . $separator . http_build_query($gets);
    }

    curl_setopt($curl, CURLOPT_URL, $url);
    // Return the response body from curl_exec() instead of printing it.
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);

    $result = curl_exec($curl);
    curl_close($curl);

    return $result;
}