Публикации - Парсинг сайтов на Laravel

Реализация парсера для Onliner.by

Особенность парсинга Onliner.by состоит в том, что мы используем API, предоставляющее данные в формате JSON. Для парсинга доступны ссылки следующего вида:

https://catalog.api.onliner.by/search/название_каталога

Например:

  • https://catalog.api.onliner.by/search/mobile
  • https://catalog.api.onliner.by/search/printer

Список всех категорий сайта можно получить по ссылке: https://catalog.onliner.by

Сама реализация выглядит так:

namespace App\Parser;

use Symfony\Component\DomCrawler\Crawler;
use App\Parser\ParseContract;
use App\Product;
use App\Category;
use App\CatalogOnliner;
use Auth;

/**
 * Parser implementation for Onliner.by.
 *
 * Products are read from the JSON search API
 * (https://catalog.api.onliner.by/search/<catalog-slug>), while the category
 * tree is scraped from the HTML of the catalog front page that is downloaded
 * once in the constructor.
 */
class Onliner implements ParseContract
{
    use ParseTrait;

    public $str = '';
    public $url = '';
    public $obj;

    /** @var mixed Specification markup of the product processed last (see setCharacter()). */
    public $char;

    /** @var Crawler DOM of the catalog front page; empty when the page could not be downloaded. */
    public $crawler;

    /** @var mixed Id of the parent category currently being processed by h2Catalog()/listCatalog(). */
    public $id;

    /**
     * Downloads the catalog front page and wraps it in a crawler.
     *
     * A failed download leaves an empty crawler instead of aborting, because
     * getParse() works against the JSON API and does not need this page.
     */
    public function __construct()
    {
        try {
            $file = file_get_contents('http://catalog.onliner.by/');
        } catch (\ErrorException $e) {
            // Reached when the application's error handler turns PHP warnings into exceptions.
            $file = false;
        }

        $this->crawler = ($file === false) ? new Crawler() : new Crawler($file);
    }

    /**
     * Imports the first result page of one catalog section into the products table.
     *
     * Only `page=1` of the search API is requested. Products previously stored
     * for the matched category are replaced by the freshly downloaded ones.
     *
     * @param object $data Must expose `url` (catalog section URL) and `name` (category name).
     *
     * @return string Number of stored products followed by " товаров".
     *
     * @throws \RuntimeException When there is no authenticated user to own the products.
     */
    public function getParse($data)
    {
        // Resolve the owner before anything is deleted, so a missing user cannot
        // leave the category half-emptied.
        $user = Auth::user();
        if (is_null($user)) {
            throw new \RuntimeException('Onliner parser requires an authenticated user.');
        }
        $userId = $user->id;

        $payload = $this->fetchSearchPayload($this->buildSearchUrl($data->url));
        if (empty($payload)) {
            // Nothing was downloaded: keep whatever is stored already.
            return 0 . " товаров";
        }

        $cat = CatalogOnliner::where('name', $data->name)->first();
        $catId = (isset($cat->id)) ? $cat->id : 0;

        if (!is_null($cat)) {
            // Remove every stale product of the category, not only the first one,
            // otherwise each re-parse would duplicate the products.
            Product::where('cat_id', $cat->id)->delete();
        }

        $saved = 0;
        foreach ($payload as $items) {
            // Only JSON lists hold products; paging/meta entries are decoded as objects.
            if (!is_array($items)) {
                continue;
            }

            foreach ($items as $item) {
                $images = $this->onlinerField($item, 'images', []);
                $reviews = $this->onlinerField($item, 'reviews', []);
                $prices = $this->onlinerField($item, 'prices', []);
                $forum = $this->onlinerField($item, 'forum', []);

                $rating = $this->onlinerField($reviews, 'rating', null);
                $productUrl = $this->onlinerField($item, 'html_url');

                // Fetch the product specification table.
                $this->setCharacter($productUrl, ".product-specs__table");

                // Store the product in the database.
                $prod = new Product();
                $prod->cat_id = $catId;
                $prod->site = 'onliner.by';
                $prod->site_product_id = $this->onlinerField($item, 'id');
                $prod->site_key = $this->onlinerField($item, 'key');
                $prod->name = $this->onlinerField($item, 'full_name');
                $prod->header_picture = $this->onlinerField($images, 'header');
                $prod->icon_picture = $this->onlinerField($images, 'icon');
                $prod->description = $this->onlinerField($item, 'description');
                $prod->site_product_url = $productUrl;
                $prod->site_reviews_url = $this->onlinerField($reviews, 'html_url');
                $prod->rating = is_null($rating) ? '' : (int)$rating;
                $prod->count = 1;
                $prod->price_min = $this->onlinerField($prices, 'min');
                $prod->price_max = $this->onlinerField($prices, 'max');
                $prod->site_prices_url = $this->onlinerField($prices, 'html_url');
                $prod->currency = $this->onlinerField($prices, 'currency_sign');
                $prod->site_forum_url = $this->onlinerField($forum, 'topic_url');
                $prod->site_api_url = $this->onlinerField($item, 'url');
                $prod->character = $this->getCharacter();
                $prod->user_id = $userId;
                $prod->save();

                // Pause between products so the product pages are not requested in a burst.
                sleep(1);
                $saved++;
            }
        }

        return $saved . " товаров";
    }

    /**
     * Stores the top-level catalog sections found on the front page.
     *
     * @return array Always an empty array.
     */
    public function catalog()
    {
        $arr = [];
        $this->crawler->filter(".catalog-navigation-classifier__item")->each(function (Crawler $node, $i) {
            $title = $node->filter('.catalog-navigation-classifier__item-title-wrapper');
            if (count($title) === 0) {
                // text() throws on an empty node list; skip items that have no title.
                return;
            }

            $id = ($node->attr('data-id') != null) ? $node->attr('data-id') : 0;
            $cat = CatalogOnliner::firstOrNew(['id' => $id]);
            $cat->name = $title->text();
            $cat->type = 'link';
            $cat->parent_id = 0;
            $cat->save();
        });

        return $arr;
    }

    /**
     * Downloads a product page and remembers the markup matched by $selector.
     *
     * The stored value is reset first, so a failed download never leaves the
     * previous product's specification behind.
     *
     * @param string $path     URL (or path) of the product page.
     * @param string $selector CSS selector of the specification block.
     *
     * @return bool True when the page was downloaded, false otherwise.
     */
    public function setCharacter($path, $selector)
    {
        $this->char = '';

        if (!is_string($path) || $path === '') {
            return false;
        }

        try {
            $file = file_get_contents($path);
        } catch (\ErrorException $e) {
            // Reached when the application's error handler turns PHP warnings into exceptions.
            $file = false;
        }

        if ($file === false) {
            return false;
        }

        // html() is provided by ParseTrait.
        $this->char = $this->html(new Crawler($file), $selector);

        return true;
    }

    /**
     * Stores the group headings (type "h2") found under every known catalog entry.
     *
     * NOTE(review): iterates over all CatalogOnliner rows, including rows created
     * by an earlier run of this method; confirm whether it should be limited to
     * top-level sections.
     */
    public function h2Catalog()
    {
        foreach (CatalogOnliner::all() as $one) {
            // Kept in sync because $id is a public property.
            $this->id = $one->id;
            $parentId = $one->id;

            $this->crawler
                ->filter('.catalog-navigation-list__category[data-id="' . $parentId . '"]')
                ->each(function (Crawler $category, $i) use ($parentId) {
                    $category
                        ->filter('.catalog-navigation-list__group-title')
                        ->each(function (Crawler $title, $j) use ($parentId) {
                            $cat = new CatalogOnliner;
                            $cat->name = $title->text();
                            $cat->parent_id = $parentId;
                            $cat->type = 'h2';
                            $cat->save();
                        });
                });
        }
    }

    /**
     * Walks the groups and links below every known catalog entry.
     *
     * NOTE(review): the models are built but never saved. Both save() calls
     * were already disabled in the original code, so persistence stays off
     * here; confirm whether that is intended.
     */
    public function listCatalog()
    {
        foreach (CatalogOnliner::all() as $one) {
            // Kept in sync because $id is a public property.
            $this->id = $one->id;
            $parentId = $one->id;

            $this->crawler
                ->filter('.catalog-navigation-list__category[data-id="' . $parentId . '"]')
                ->each(function (Crawler $category, $i) use ($parentId) {
                    $category
                        ->filter('.catalog-navigation-list__group')
                        ->each(function (Crawler $group, $j) use ($parentId) {
                            $cat = new CatalogOnliner;
                            $cat->name = $group->filter('.catalog-navigation-list__group-title')->text();
                            $cat->category_type = 'h2';
                            $cat->parent_id = $parentId;
                            $cat->type = 'h2';

                            $group
                                ->filter('.catalog-navigation-list__link-inner a')
                                ->each(function (Crawler $link, $k) use ($parentId) {
                                    $cat = new CatalogOnliner;
                                    $cat->name = $link->text();
                                    $cat->url = $link->attr('href');
                                    $cat->parent_id = $parentId;
                                    $cat->type = '';
                                });
                        });
                });
        }
    }

    /**
     * Returns the specification markup stored by the last setCharacter() call.
     *
     * @return mixed
     */
    public function getCharacter()
    {
        return $this->char;
    }

    /**
     * Builds the search API URL for the first result page of a catalog section.
     *
     * The last path segment of $url (with its query string, if any) is used as
     * the API slug. A trailing slash is ignored so it cannot yield an empty slug.
     *
     * @param string $url Catalog section URL, e.g. http://catalog.onliner.by/mobile
     *
     * @return string
     */
    private function buildSearchUrl($url)
    {
        $segments = explode('/', rtrim($url, '/'));
        $slug = end($segments);

        // The separator is decided from the very string that gets appended.
        $separator = (strpos($slug, '?') === false) ? '?' : '&';

        return 'https://catalog.api.onliner.by/search/' . $slug . $separator . 'page=1';
    }

    /**
     * Requests $apiUrl and returns the decoded top-level JSON members.
     *
     * Nested JSON objects stay objects, which lets the caller tell product
     * lists (JSON arrays) apart from meta entries.
     *
     * @param string $apiUrl
     *
     * @return array Empty when the request failed or the body is not valid JSON.
     */
    private function fetchSearchPayload($apiUrl)
    {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $apiUrl);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        $result = curl_exec($ch);
        curl_close($ch);

        if (!is_string($result) || $result === '') {
            return [];
        }

        return (array)json_decode($result);
    }

    /**
     * Reads one member of a decoded JSON object or array.
     *
     * @param mixed  $source  Object, array or null.
     * @param string $key     Member name.
     * @param mixed  $default Returned when the member is missing or null.
     *
     * @return mixed
     */
    private function onlinerField($source, $key, $default = '')
    {
        $values = (array)$source;

        return (isset($values[$key])) ? $values[$key] : $default;
    }
}

Количество комментариев: 0

Для того, чтобы оставить комментарий, необходимо зарегистрироваться