Skip to content

Commit 939f76e

Browse files
committed
Enhancement: Added snippets_per_page filter
1 parent ec7b185 commit 939f76e

7 files changed

+87
-21
lines changed

src/Observers/SnippetCrawlObserver.php

+3-2
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ public function crawled(UriInterface $url, ResponseInterface $response, ?UriInte
5252
if ($this->webCrawler->isCrawled($url)) return;
5353

5454
// Create crawler from response body
55-
$crawler = new Crawler((string) $response->getBody());
55+
$crawler = new Crawler((string) $response->getBody(), (string) $url);
5656

5757
// Must contain snippets
5858
if (0 === $crawler->filter($this->webCrawler->getConfig()['html_tags']['snippet'])->count()) return;
@@ -63,7 +63,8 @@ public function crawled(UriInterface $url, ResponseInterface $response, ?UriInte
6363
// New meta snippet
6464
$metaSnippet = new MetaSnippetCollection([
6565
'uri' => (string) $url,
66-
'snippets' => $this->webCrawler->getScraper($url->getHost())->fetchFromDocument($crawler)
66+
'snippets' => $this->webCrawler->getScraper($url->getHost())
67+
->fetchFromDocument($crawler, $this->webCrawler->getMeta(), $url)
6768
]);
6869

6970
// Must contain snippets

src/Scrapers/AbstractScraper.php

+44-10
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ public function __construct(array $config = [])
8080
*/
8181
public function fetch(UriInterface $uri, array $options = []): array
8282
{
83-
$this->fetchFromDocument($this->getCrawler($uri), $options);
83+
$this->fetchFromDocument($this->getCrawler($uri), $options, $uri);
8484

8585
return $this->snippets;
8686
}
@@ -90,19 +90,20 @@ public function fetch(UriInterface $uri, array $options = []): array
9090
*
9191
* @param string|Symfony\Component\DomCrawler\Crawler $document
9292
* @param array $options
93+
* @param Psr\Http\Message\UriInterface $uri
9394
* @return Snippetify\SnippetSniffer\Common\Snippet[]
9495
*/
95-
public function fetchFromDocument($document, array $options = []): array
96+
public function fetchFromDocument($document, array $options = [], ?UriInterface $uri = null): array
9697
{
97-
$crawler = $document instanceof Crawler ? $document : new Crawler($document);
98+
$crawler = $document instanceof Crawler ? $document : new Crawler($document, $uri);
9899

99100
try {
100101

101102
$htmlTags = explode(',', $this->config['html_tags']['snippet']);
102103

103104
foreach ($htmlTags as $value) {
104-
$crawler->filter($value)->each(function ($node) use ($crawler) {
105-
$this->hydrateSnippets($node, $crawler);
105+
$crawler->filter($value)->each(function ($node) use ($crawler, $options) {
106+
$this->hydrateSnippets($node, $crawler, $options);
106107
});
107108
}
108109

@@ -134,24 +135,25 @@ protected function getCrawler(UriInterface $uri): Crawler
134135
*/
135136
protected function hydrateSnippets(Crawler $node, Crawler $crawler, array $meta = []): void
136137
{
137-
if ($this->containsSnippet($this->snippets, $node)) return;
138+
if ($this->containsSnippet($node)) return;
139+
140+
if ($this->hasMoreSnippetsPerPage($crawler, $meta)) return;
138141

139142
if ($snippet = $this->fetchSnippet($node, $crawler, $meta)) $this->snippets[] = $snippet;
140143
}
141144

142145
/**
143146
* Contains snippet.
144147
*
145-
* @param Snippetify\SnippetSniffer\Common\Snippet[] $snippets
146148
* @param Symfony\Component\DomCrawler\Crawler $node
147149
* @return bool
148150
*/
149-
protected function containsSnippet(array $snippets, Crawler $node): bool
151+
protected function containsSnippet(Crawler $node): bool
150152
{
151153
$has = false;
152154

153155
try {
154-
foreach ($snippets as $snippet) {
156+
foreach ($this->snippets as $snippet) {
155157
if ($snippet->code == $node->text()) {
156158
$has = true;
157159
break;
@@ -164,6 +166,37 @@ protected function containsSnippet(array $snippets, Crawler $node): bool
164166
return $has;
165167
}
166168

169+
/**
170+
* Whether the snippets_per_page limit has already been reached for this page.
171+
*
172+
* @param Symfony\Component\DomCrawler\Crawler $crawler
173+
* @param array $meta
174+
* @return bool
175+
*/
176+
protected function hasMoreSnippetsPerPage(Crawler $crawler, array $meta): bool
177+
{
178+
if (empty($meta['snippets_per_page'])) return false;
179+
180+
return $meta['snippets_per_page'] <= $this->countRetrievedSnippetsPerPage($crawler);
181+
}
182+
183+
/**
184+
* Count retrieved snippets per page.
185+
*
186+
* @param Symfony\Component\DomCrawler\Crawler $crawler
187+
* @return int
188+
*/
189+
protected function countRetrievedSnippetsPerPage(Crawler $crawler): int
190+
{
191+
$count = 0;
192+
193+
foreach ($this->snippets as $snippet) {
194+
if ($crawler->getUri() === $snippet->meta['url']) $count++;
195+
}
196+
197+
return $count;
198+
}
199+
167200
/**
168201
* Fetch snippet.
169202
*
@@ -224,6 +257,7 @@ protected function fetchTags(Crawler $node): array
224257
*/
225258
protected function fetchWebsiteMetadata(Crawler $crawler): array
226259
{
260+
$url = new Uri($crawler->getUri());
227261
$title = $crawler->filter('title')->text();
228262
$siteIcon = $crawler->filter('link[rel="icon"]');
229263
$ogImage = $crawler->filter('meta[property="og:image"]');
@@ -251,7 +285,7 @@ protected function fetchWebsiteMetadata(Crawler $crawler): array
251285
return [
252286
'name' => $name,
253287
'brand' => $brand,
254-
'url' => (new Uri($crawler->getUri()))->getHost(),
288+
'url' => $url->getScheme() . '://' . $url->getHost(),
255289
];
256290
}
257291

src/Scrapers/ScraperInterface.php

+1-1
Original file line numberDiff line numberDiff line change
@@ -17,5 +17,5 @@ interface ScraperInterface
1717
{
1818
public function fetch(UriInterface $uri, array $options = []): array;
1919

20-
public function fetchFromDocument($document, array $options = []): array;
20+
public function fetchFromDocument($document, array $options = [], ?UriInterface $uri = null): array;
2121
}

src/Scrapers/StackoverflowScraper.php

+9-7
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ final class StackoverflowScraper extends AbstractScraper
2626
*/
2727
public function fetch(UriInterface $uri, array $options = []): array
2828
{
29-
$this->fetchFromDocument($this->getCrawler($uri), $options);
29+
$this->fetchFromDocument($this->getCrawler($uri), $options, $uri);
3030

3131
return $this->snippets;
3232
}
@@ -36,23 +36,25 @@ public function fetch(UriInterface $uri, array $options = []): array
3636
*
3737
* @param string|Symfony\Component\DomCrawler\Crawler $document
3838
* @param array $options
39+
* @param Psr\Http\Message\UriInterface $uri
3940
* @return Snippetify\SnippetSniffer\Common\Snippet[]
4041
*/
41-
public function fetchFromDocument($document, array $options = []): array
42+
public function fetchFromDocument($document, array $options = [], ?UriInterface $uri = null): array
4243
{
43-
$crawler = $document instanceof Crawler ? $document : new Crawler($document);
44+
$crawler = $document instanceof Crawler ? $document : new Crawler($document, $uri);
4445

4546
try {
4647
$crawler->filter('#answers .answer')->each(function ($node) use ($crawler, $options) {
4748

4849
if (($accepted = strpos($node->attr('class'), 'accepted') !== false) === false && // Only accepted snippets
4950
isset($options['only_accepted']) && $options['only_accepted'] === true) return;
5051

51-
$meta = ['accepted' => $accepted];
52+
$options['accepted'] = $accepted;
5253

53-
$node->filter('pre')->each(function ($node) use ($crawler, $meta) {
54-
if ($this->containsSnippet($this->snippets, $node->filter('code'))) return;
55-
if ($snippet = $this->fetchSnippet($node, $crawler, $meta)) $this->snippets[] = $snippet;
54+
$node->filter('pre')->each(function ($node) use ($crawler, $options) {
55+
if ($this->containsSnippet($node->filter('code'))) return;
56+
if ($this->hasMoreSnippetsPerPage($crawler, $options)) return;
57+
if ($snippet = $this->fetchSnippet($node, $crawler, $options)) $this->snippets[] = $snippet;
5658
});
5759
});
5860
} catch (\Exception $e) {

src/SnippetSniffer.php

+1-1
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ public function fetch(string $query, array $meta = []): array
124124
$urls = $this->provider()->fetch($query, $meta);
125125

126126
foreach ($urls as $url) {
127-
$snippets = array_merge($snippets, $this->scraper($url->getHost())->fetch($url));
127+
$snippets = array_merge($snippets, $this->scraper($url->getHost())->fetch($url, $meta));
128128
}
129129

130130
return $snippets;

src/WebCrawler.php

+8
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,14 @@ public static function create(array $config = []): self
126126
return self::$instance;
127127
}
128128

129+
/**
130+
* @return array
131+
*/
132+
public function getMeta(): array
133+
{
134+
return $this->meta;
135+
}
136+
129137
/**
130138
* @return array
131139
*/

tests/SnippetSnifferTest.php

+21
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,27 @@ public function testContainsResults()
183183
$this->assertGreaterThan(0, count($data));
184184
}
185185

186+
public function testSnippetsPerPage()
187+
{
188+
$perPage = 5;
189+
$data = $this->sniffer->fetch('js array contains', [ 'snippets_per_page' => $perPage, 'page' => 1, 'limit' => 10 ]);
190+
$has = true;
191+
$pages = [];
192+
193+
foreach ($data as $snippet) {
194+
$pages[$snippet->meta['url']] = ($pages[$snippet->meta['url']] ?? 0) + 1;
195+
}
196+
197+
foreach ($pages as $uri => $page) {
198+
if ($page > $perPage) {
199+
$has = false;
200+
break;
201+
}
202+
}
203+
204+
$this->assertTrue($has);
205+
}
206+
186207
public function testAddScraper()
187208
{
188209
$data = $this->sniffer

0 commit comments

Comments
 (0)