From 2fe8d923dd2deab95318d2658200e9dae3cf64d5 Mon Sep 17 00:00:00 2001 From: Julien Nioche Date: Mon, 21 Oct 2024 13:34:25 +0100 Subject: [PATCH] Bugfix nofollow instructions in rel tags ignored when the tag has more than one value (#1362) Signed-off-by: Julien Nioche --- .../java/org/apache/stormcrawler/bolt/JSoupParserBolt.java | 6 +++++- core/src/test/resources/digitalpebble.com.html | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/core/src/main/java/org/apache/stormcrawler/bolt/JSoupParserBolt.java b/core/src/main/java/org/apache/stormcrawler/bolt/JSoupParserBolt.java index 5c828bc1a..015403d06 100644 --- a/core/src/main/java/org/apache/stormcrawler/bolt/JSoupParserBolt.java +++ b/core/src/main/java/org/apache/stormcrawler/bolt/JSoupParserBolt.java @@ -30,6 +30,7 @@ import java.util.List; import java.util.Locale; import java.util.Map; +import java.util.stream.Stream; import org.apache.commons.lang.StringUtils; import org.apache.storm.metric.api.MultiCountMetric; import org.apache.storm.task.OutputCollector; @@ -269,7 +270,10 @@ public void execute(Tuple tuple) { final URL baseURL = new URL(url); for (Element link : links) { // nofollow - boolean noFollow = "nofollow".equalsIgnoreCase(link.attr("rel")); + String[] relkeywords = link.attr("rel").split(" "); + boolean noFollow = + Stream.of(relkeywords).anyMatch(x -> x.equalsIgnoreCase("nofollow")); + // remove altogether if (noFollow && robots_noFollow_strict) { continue; diff --git a/core/src/test/resources/digitalpebble.com.html b/core/src/test/resources/digitalpebble.com.html index 8b1de6881..683321640 100644 --- a/core/src/test/resources/digitalpebble.com.html +++ b/core/src/test/resources/digitalpebble.com.html @@ -50,6 +50,7 @@ +