From 524cdf2cecae7f5b7e928fe3697d46b2bd8b9aa7 Mon Sep 17 00:00:00 2001
From: Ivan Kozik
Date: Fri, 21 Aug 2015 05:35:30 +0000
Subject: [PATCH] Fix blogspot search? ignore

---
 libgrabsite/ignore_sets/global | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/libgrabsite/ignore_sets/global b/libgrabsite/ignore_sets/global
index da9008e..7ee0b55 100644
--- a/libgrabsite/ignore_sets/global
+++ b/libgrabsite/ignore_sets/global
@@ -253,10 +253,10 @@
 ^https?://r-login\.wordpress\.com/remote-login\.php
 ^https?://{primary_netloc}/wp-login\.php\?
 
-# Ignore /search.*updated-max= blogspot pagination because all posts are
-# crawled anyway via the _archive.html pages. Need to ignore on all
-# domains because blogspot also runs on non-blogspot.com domains.
-^https?://{primary_netloc}/search(/label/[^\?]+|)\?updated-max=\d{4}-\d\d-\d\dT\d\d:\d\d:\d\d.*&max-results=\d+
+# Ignore /search.*updated-(min|max)= blogspot pagination because all posts are
+# crawled anyway via the _archive.html pages. Need to ignore on all domains
+# because blogspot also runs on non-blogspot.com domains.
+^https?://{primary_netloc}/search(/label/[^\?]+|)\?updated-(min|max)=\d{4}-\d\d-\d\dT\d\d:\d\d:\d\d.*&max-results=\d+
 
 # Ignore bogus /CSI/ links on blogspot.com
 ^https?://.+\.blogspot\.(com|in|com\.au|co\.uk|jp|co\.nz|ca|de|it|fr|se|sg|es|pt|com\.br|ar|mx|kr)/\d{4}/\d{2}/CSI/$