Skip to content

Commit b586d0c

Browse files
committed
Crawl result validation.
1 parent f6e024b commit b586d0c

File tree

3 files changed

+257
-0
lines changed

3 files changed

+257
-0
lines changed

capture.go

+17
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,9 @@ func waitForCrawlToFinish(url string, body []byte, requestTimeout time.Duration,
160160

161161
if body != nil && !expr.Match(body) {
162162
log.WithField("url", url).WithField("wait-timeout", waitTimeout).WithField("poll-interval", pollInterval).Debugf("Detected crawl completion after %s", d)
163+
if err := checkCrawlResult(body); err != nil {
164+
return err
165+
}
163166
return nil
164167
}
165168

@@ -185,6 +188,20 @@ func waitForCrawlToFinish(url string, body []byte, requestTimeout time.Duration,
185188
return nil
186189
}
187190

191+
// checkCrawlResult searches for known archive.is errors in HTML content.
192+
func checkCrawlResult(body []byte) error {
193+
doc, err := goquery.NewDocumentFromReader(bytes.NewBuffer(body))
194+
if err != nil {
195+
return fmt.Errorf("crawl result check gq new doc: %s", err)
196+
}
197+
if block := doc.Find("html > body > div").First(); block != nil {
198+
if text := strings.Trim(block.Text(), "\r\n\t "); text == "Error: Network error." {
199+
return fmt.Errorf("archive.is crawl result: Network Error")
200+
}
201+
}
202+
return nil
203+
}
204+
188205
func doRequest(method string, url string, body io.ReadCloser, timeout time.Duration) (*http.Response, []byte, error) {
189206
req, err := newRequest(method, url, body)
190207
if err != nil {

capture_test.go

+2
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
// +build integration
2+
13
package archiveis
24

35
import (

0 commit comments

Comments
 (0)