
Commit d45f370

Web crawler implemented
1 parent 5c20b57 commit d45f370

14 files changed: +296 -120 lines

README.md (+83 -1)
@@ -15,6 +15,8 @@ This library allows you
 $ composer require snippetify/snippet-sniffer
 ```
 
+### Snippet Sniffer
+
 ```php
 use Snippetify\SnippetSniffer\SnippetSniffer;
 
@@ -199,13 +201,93 @@ SnippetSniffer::create(...)
 ...
 ```
 
+### Sniptbot
+
+Sniptbot allows you to extract all snippets from a website by crawling it.
+
+```php
+use Snippetify\SnippetSniffer\WebCrawler;
+
+// Optional
+$config = [...];
+
+// @return Snippetify\SnippetSniffer\Common\MetaSnippetCollection[]
+$snippets = WebCrawler::create($config)->fetch(['your uri']);
+```
+
+#### Configuration reference
+
+```php
+$config = [
+    // Required
+    // Search engine API configuration keys
+    'provider' => [
+        "cx" => "your google Search engine ID",
+        "key" => "your google API key",
+        'name' => 'provider name (google)',
+    ],
+    // Optional
+    // Useful for adding meta information to each snippet
+    'app' => [
+        "name" => "your App name",
+        'version' => 'your App version',
+    ],
+    // Optional
+    // Useful for logging
+    'logger' => [
+        "name" => "logger name",
+        'file' => 'logger file path',
+    ],
+    // Optional
+    // Useful for scraping
+    "html_tags" => [
+        "snippet" => "pre[class] code, div[class] code, .highlight pre, code[class]", // Tags to fetch snippets
+        "index" => "h1, h2, h3, h4, h5, h6, p, li" // Tags to index
+    ],
+    // Optional
+    // Useful for adding new scrapers
+    // The name must be the website host without the scheme, i.e. foo.com, not https://foo.com
+    "scrapers" => [
+        "scraper_name" => ScraperClass::class,
+        "scraper_2_name" => Scraper2Class::class // You can add as many as you want
+    ],
+    // Optional
+    // Useful for adding new providers
+    "providers" => [
+        "provider_name" => ProviderClass::class,
+        "provider_2_name" => Provider2Class::class // You can add as many as you want
+    ],
+    // Optional
+    // Useful for web crawling
+    // Please follow the link below for more information as we use the Spatie crawler
+    // https://github.com/spatie/crawler
+    "crawler" => [
+        "langs" => ['en'],
+        "profile" => CrawlSubdomainsAndUniqueUri::class,
+        "user_agent" => 'your user agent',
+        "concurrency" => 10,
+        "ignore_robots" => false,
+        "maximum_depth" => null,
+        "execute_javascript" => false,
+        "maximum_crawl_count" => null,
+        "parseable_mime_types" => 'text/html',
+        "maximum_response_size" => 1024 * 1024 * 3,
+        "delay_between_requests" => 250,
+    ]
+];
+```
+
 ## Changelog
 
 Please see [CHANGELOG](https://github.com/snippetify/snippet-sniffer/blob/master/CHANGELOG.md) for more information what has changed recently.
 
 ## Testing
 
-You must set the **PROVIDER_NAME**, **PROVIDER_CX**, **PROVIDER_KEY** keys in phpunit.xml file before running tests.
+You must set the **PROVIDER_NAME**, **PROVIDER_CX**, **PROVIDER_KEY**, **CRAWLER_URI**, **DEFAULT_SCRAPER_URI**, **STACKOVERFLOW_SCRAPER_URI** keys in the phpunit.xml file before running tests.
+
+**Important:** Those links must contain at least one snippet, otherwise the tests will fail. The **Stackoverflow** URI must be a question link with an accepted answer, otherwise the tests will fail.
 
 ```bash
 composer test

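For orientation, a minimal sketch of consuming the crawl results described in the README section above. The `uri`, `page`, and `snippets` members follow how `SnippetCrawlObserver` populates `MetaSnippetCollection` in this commit; the URI and `$config` values are placeholders, not part of the repository.

```php
<?php

use Snippetify\SnippetSniffer\WebCrawler;

// Sketch only: iterate the MetaSnippetCollection[] returned by fetch().
// 'https://example.com' and the $config contents are placeholders.
$config = [/* see the configuration reference above */];

$results = WebCrawler::create($config)->fetch(['https://example.com']);

foreach ($results as $metaSnippet) {
    echo $metaSnippet->uri, PHP_EOL;             // crawled page URI
    echo $metaSnippet->page->title, PHP_EOL;     // WebPage metadata (title, summary, lang, ...)
    echo count($metaSnippet->snippets), PHP_EOL; // number of scraped Snippet objects
}
```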
phpunit.xml.dist (+3)
@@ -21,5 +21,8 @@
         <server name="PROVIDER_CX" value=""/>
         <server name="PROVIDER_KEY" value=""/>
         <server name="APP_ENV" value="testing"/>
+        <server name="CRAWLER_URI" value=""/>
+        <server name="DEFAULT_SCRAPER_URI" value=""/>
+        <server name="STACKOVERFLOW_SCRAPER_URI" value=""/>
     </php>
 </phpunit>

src/Common/Snippet.php (+1 -1)
@@ -18,7 +18,7 @@
  */
 class Snippet
 {
-    const WIKI_TYPE = 'wiki';
+    const ROBOT_TYPE = 'robot';
 
     /**
      * @var string

src/Common/WebPage.php (+4 -9)
@@ -19,22 +19,17 @@ class WebPage
     /**
      * @var string
      */
-    public $siteName;
-
-    /**
-     * @var Psr\Http\Message\UriInterface
-     */
-    public $siteUri;
+    public $title;
 
     /**
      * @var string
      */
-    public $title;
-
+    public $summary;
+
     /**
      * @var string
      */
-    public $summary;
+    public $lang;
 
     /**
      * @var Psr\Http\Message\UriInterface

src/Core.php (+7 -4)
@@ -11,23 +11,26 @@
 
 namespace Snippetify\SnippetSniffer;
 
+use Snippetify\SnippetSniffer\Profiles\CrawlSubdomainsAndUniqueUri;
+
 class Core
 {
     public const APP_NAME = 'Snippet sniffer';
     public const APP_TYPE = 'snippetify-sniffer';
     public const APP_VERSION = '1.1.0';
 
     // Crawler
-    public const CRAWLER_PROFILE = \Spatie\Crawler\CrawlSubdomains::class;
+    public const CRAWLER_LANG = 'en';
+    public const CRAWLER_PROFILE = CrawlSubdomainsAndUniqueUri::class;
     public const CRAWLER_CONCURENCY = 10;
     public const CRAWLER_IGNORE_ROBOTS = true;
-    public const CRAWLER_MAXIMUM_DEPTH = 50;
+    public const CRAWLER_MAXIMUM_DEPTH = null;
     public const CRAWLER_EXECUTE_JAVASCRIPT = false;
-    public const CRAWLER_MAXIMUM_CRAWL_COUNT = 1500;
+    public const CRAWLER_MAXIMUM_CRAWL_COUNT = null;
     public const CRAWLER_PARSEABLE_MIME_TYPES = 'text/html';
     public const CRAWLER_MAXIMUM_RESPONSE_SIZE = 1024 * 1024 * 3;
     public const CRAWLER_DELAY_BETWEEN_REQUESTS = 250;
-    public const CRAWLER_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0';
+    public const CRAWLER_USER_AGENT = 'Mozilla/5.0 (compatible; Sniptbot/1.0; +http://www.snippetify.com/bot)';
 
     /**
      * Html Snippet tags

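The constants above act as defaults for the optional `crawler` block documented in the README. As a purely hypothetical illustration (the merging code is not part of this diff), user-supplied keys would take precedence over these defaults roughly like so:

```php
<?php

use Snippetify\SnippetSniffer\Core;

// Hypothetical sketch, not the library's actual merging code:
// keys omitted from the user's 'crawler' block fall back to the Core defaults.
$crawlerConfig = array_merge([
    'langs'         => [Core::CRAWLER_LANG],
    'profile'       => Core::CRAWLER_PROFILE,
    'user_agent'    => Core::CRAWLER_USER_AGENT,
    'concurrency'   => Core::CRAWLER_CONCURENCY,
    'ignore_robots' => Core::CRAWLER_IGNORE_ROBOTS,
], $config['crawler'] ?? []);
```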
src/Observers/SnippetCrawlObserver.php (+44 -8)
@@ -48,32 +48,51 @@ public function __construct(WebCrawler $webCrawler)
      */
     public function crawled(UriInterface $url, ResponseInterface $response, ?UriInterface $foundOnUrl = null)
     {
-        $crawler = new Crawler((string) $response->getBody());
+        // Must crawl each uri only once
+        if ($this->webCrawler->isCrawled($url)) return;
 
+        // Create crawler from response body
+        $crawler = new Crawler((string) $response->getBody());
+
+        // Must contain snippets
         if (0 === $crawler->filter($this->webCrawler->getConfig()['html_tags']['snippet'])->count()) return;
 
+        // Only crawl specified langs
+        if (!$this->hasLang($crawler)) return;
+
+        // New meta snippet
+        $metaSnippet = new MetaSnippetCollection([
+            'uri' => (string) $url,
+            'snippets' => $this->webCrawler->getScraper($url->getHost())->fetchFromDocument($crawler)
+        ]);
+
+        // Must contain snippets
+        if (0 === count($metaSnippet->snippets)) return;
+
+        // Get page description
         $summTags = 'meta[name="description"], meta[property="og:description"]';
         $summary = 0 === $crawler->filter($summTags)->count() ? '' : $crawler->filter($summTags)->attr('content');
 
-        $metaSnippet = new MetaSnippetCollection(['uri' => $url]);
-
         try {
             $metaSnippet->page = new WebPage([
-                'link' => $url,
                 'summary' => $summary,
+                'link' => (string) $url,
                 'title' => $crawler->filter('title')->text(),
+                'lang' => $crawler->filter('html')->attr('lang'),
                 'metaTags' => $crawler->filter('meta')
                     ->each(function ($v) { return [$v->attr('name') => $v->attr('content')]; }),
-                'plainText' => $crawler->filter($this->webCrawler->getConfig()['html_tags']['index'])
-                    ->each(function ($v) { return ' ' . $v->text(); }),
+                'plainText' => implode(' ', $crawler->filter($this->webCrawler->getConfig()['html_tags']['index'])
+                    ->each(function ($v) { return $v->text(); })),
             ]);
         } catch (\Exception $e) {
             $this->webCrawler->logError($e);
         }
 
-        $metaSnippet->snippets = $this->webCrawler->getScraper($url->getHost())->fetchFromDocument($crawler);
+        // Save meta snippet
+        $this->webCrawler->addUniqueSnippet($metaSnippet);
 
-        $this->webCrawler->addSnippet($metaSnippet);
+        // Save crawled uri
+        $this->webCrawler->addToCrawledUris($url);
     }
 
     /**
@@ -87,4 +106,21 @@ public function crawlFailed(UriInterface $url, RequestException $requestExceptio
     {
         $this->webCrawler->logError($requestException);
     }
+
+    /**
+     * @param  Symfony\Component\DomCrawler\Crawler $crawler
+     * @return bool
+     */
+    private function hasLang(Crawler $crawler): bool
+    {
+        $has = false;
+
+        foreach ($this->webCrawler->getConfig()['crawler']['langs'] as $value) {
+            if (false !== stripos($crawler->filter('html')->attr('lang'), $value)) {
+                $has = true;
+            }
+        }
+
+        return $has;
+    }
 }
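The observer relies on `WebCrawler::isCrawled()`, `addToCrawledUris()` and `addUniqueSnippet()`, none of which appear in this diff; note also that `hasLang()` uses `stripos()`, so a configured lang of `'en'` also matches attribute values such as `en-US`. A minimal, hypothetical sketch of what the de-duplication helpers could look like:

```php
<?php

use Psr\Http\Message\UriInterface;

// Hypothetical sketch only: the real WebCrawler implementation is not shown
// in this commit. A visited-URI list ensures each page is processed once.
trait TracksCrawledUris
{
    /** @var string[] */
    private $crawledUris = [];

    public function isCrawled(UriInterface $url): bool
    {
        return in_array((string) $url, $this->crawledUris, true);
    }

    public function addToCrawledUris(UriInterface $url): void
    {
        $this->crawledUris[] = (string) $url;
    }
}
```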
src/Profiles/CrawlSubdomainsAndUniqueUri.php (new file, +27)

@@ -0,0 +1,27 @@
+<?php
+
+namespace Snippetify\SnippetSniffer\Profiles;
+
+use Psr\Http\Message\UriInterface;
+use Spatie\Crawler\CrawlSubdomains;
+use Snippetify\SnippetSniffer\WebCrawler;
+
+class CrawlSubdomainsAndUniqueUri extends CrawlSubdomains
+{
+    /**
+     * @var Snippetify\SnippetSniffer\WebCrawler
+     */
+    private $webCrawler;
+
+    public function __construct($baseUrl, WebCrawler $webCrawler)
+    {
+        parent::__construct($baseUrl);
+
+        $this->webCrawler = $webCrawler;
+    }
+
+    public function shouldCrawl(UriInterface $url): bool
+    {
+        return $this->isSubdomainOfHost($url) && !$this->webCrawler->isCrawled($url);
+    }
+}

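For context, one plausible way the new profile plugs into spatie/crawler (exact method names depend on the spatie/crawler version; `$webCrawler` is assumed to be an existing `WebCrawler` instance, and the observer namespace is inferred from its file path):

```php
<?php

use Spatie\Crawler\Crawler;
use Snippetify\SnippetSniffer\Observers\SnippetCrawlObserver;
use Snippetify\SnippetSniffer\Profiles\CrawlSubdomainsAndUniqueUri;

// Sketch only: crawl subdomains of $uri, skip URIs already seen by $webCrawler,
// and hand every crawled page to the snippet observer.
$uri = 'https://example.com';

Crawler::create()
    ->setCrawlProfile(new CrawlSubdomainsAndUniqueUri($uri, $webCrawler))
    ->setCrawlObserver(new SnippetCrawlObserver($webCrawler))
    ->startCrawling($uri);
```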
src/Scrapers/AbstractScraper.php (+2 -2)
@@ -185,11 +185,11 @@ protected function fetchSnippet(Crawler $node, Crawler $crawler, array $meta = [
         return new Snippet([
             'tags' => $tags,
             'code' => $node->text(),
-            'type' => Snippet::WIKI_TYPE,
+            'type' => Snippet::ROBOT_TYPE,
             'title' => $crawler->filter('title')->text(),
             'description' => $desc,
             'meta' => [
-                'url' => $node->getUri(),
+                'url' => $crawler->getUri(),
                 'target' => $this->config['app'],
                 'website' => $this->fetchWebsiteMetadata($crawler)
             ]

src/Scrapers/StackoverflowScraper.php (+2 -2)
@@ -86,12 +86,12 @@ protected function fetchSnippet(Crawler $node, Crawler $crawler, array $meta = [
         return new Snippet([
             'tags' => $tags,
             'title' => $title,
-            'type' => Snippet::WIKI_TYPE,
+            'type' => Snippet::ROBOT_TYPE,
             'code' => $node->filter('code')->text(),
             'description' => $desc,
             'meta' => [
                 'accepted' => $meta['accepted'],
-                'url' => $node->getUri(),
+                'url' => $crawler->getUri(),
                 'target' => $this->config['app'],
                 'website' => $this->fetchWebsiteMetadata($crawler)
             ]

src/SnippetSniffer.php (+9 -3)
@@ -39,6 +39,7 @@ final class SnippetSniffer
     /**
      * @param array $config
      * @return void
+     * @throws InvalidArgumentException
      */
     public function __construct(array $config)
     {
@@ -78,6 +79,7 @@ public static function create(array $config): self
      * @param string $name
      * @param string $class
      * @return self
+     * @throws InvalidArgumentException
      */
     public function addScraper(string $name, string $class): self
     {
@@ -96,6 +98,7 @@ public function addScraper(string $name, string $class): self
      * @param string $name
      * @param string $class
      * @return self
+     * @throws InvalidArgumentException
      */
     public function addProvider(string $name, string $class): self
     {
@@ -123,14 +126,15 @@ public function fetch(string $query, array $meta = []): array
         foreach ($urls as $url) {
             $snippets = array_merge($snippets, $this->scraper($url->getHost())->fetch($url));
         }
-        \Snippetify\SnippetSniffer\Common\Logger::create()->log(json_encode($snippets));
+
         return $snippets;
     }
 
     /**
      * Get provider.
      *
-     * @return Snippetify\SnippetSniffer\Providers\ProviderInterface
+     * @return Snippetify\SnippetSniffer\Providers\ProviderInterface
+     * @throws RuntimeException
      */
     private function provider(): ProviderInterface
     {
@@ -160,7 +164,9 @@ private function provider(): ProviderInterface
     /**
      * Get scraper.
      *
-     * @return Snippetify\SnippetSniffer\Scrapers\ScraperInterface
+     * @param string $name
+     * @return Snippetify\SnippetSniffer\Scrapers\ScraperInterface
+     * @throws RuntimeException
      */
     private function scraper(string $name): ScraperInterface
     {

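Given the `@throws` annotations added above, a hedged sketch of defensive usage (assuming the global SPL exception classes are the ones meant):

```php
<?php

use Snippetify\SnippetSniffer\SnippetSniffer;

// Sketch only: construction validates the config (InvalidArgumentException),
// and provider/scraper lookup during fetch() may raise a RuntimeException.
$config = [/* see the README configuration reference */];

try {
    $snippets = SnippetSniffer::create($config)->fetch('flatten a multidimensional array in php');
} catch (\InvalidArgumentException | \RuntimeException $e) {
    error_log($e->getMessage());
}
```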