diff --git a/pom.xml b/pom.xml index 226c851d7..68bf76d9c 100644 --- a/pom.xml +++ b/pom.xml @@ -1,7 +1,7 @@ us.codecraft - 0.7.6 + 0.8.0 4.0.0 pom @@ -14,14 +14,14 @@ 4.4 2.11.0 3.12.0 - 2.0.14.graal + 2.0.19.graal 3.0.13 31.1-jre 2.26 4.5.13 4.4.15 3.7.1 - 9.3.8.0 + 9.3.9.0 2.7.0 4.13.2 2.7.3 @@ -31,7 +31,7 @@ 1.2.0 11.4 3.141.59 - 2.0.3 + 2.0.4 4.0.0.RELEASE 0.3.5 @@ -232,7 +232,7 @@ - 3.3.9 + 3.5.0 diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index fe1ff12cf..997eb812c 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.7.6 + 0.8.0 4.0.0 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index fd35f7724..9f9201ee3 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -333,9 +333,10 @@ public void run() { } } else { // wait until new url added, - if (waitNewUrl()) - //if interrupted + if (waitNewUrl()) { + //if interrupted break; + } continue; } } @@ -805,11 +806,13 @@ public Scheduler getScheduler() { * Set wait time when no url is polled.

* * @param emptySleepTime In MILLISECONDS. + * @return this */ - public void setEmptySleepTime(long emptySleepTime) { + public Spider setEmptySleepTime(long emptySleepTime) { if(emptySleepTime<=0){ throw new IllegalArgumentException("emptySleepTime should be more than zero!"); } this.emptySleepTime = emptySleepTime; + return this; } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index f138b2004..72821f3c1 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -82,12 +82,16 @@ public Page download(Request request, Task task) { try { httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext()); page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task); + onSuccess(request, task); logger.info("downloading page success {}", request.getUrl()); + return page; } catch (IOException e) { - logger.warn("download page {} error", request.getUrl(), e); + onError(request, task, e); + logger.info("download page {} error", request.getUrl(), e); + return page; } finally { if (httpResponse != null) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java index f9ad0e98f..19d3bc732 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java @@ -1,26 +1,51 @@ package us.codecraft.webmagic.scheduler; -import us.codecraft.webmagic.Request; -import us.codecraft.webmagic.Task; - import java.util.concurrent.BlockingQueue; import 
java.util.concurrent.LinkedBlockingQueue; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Task; /** * Basic Scheduler implementation.
* Store urls to fetch in LinkedBlockingQueue and remove duplicate urls by HashMap. * + * Note: if you use this {@link QueueScheduler} + * with {@link Site#getCycleRetryTimes()} enabled, you may encounter a deadlock + * when the queue is full. + * * @author code4crafter@gmail.com
* @since 0.1.0 */ public class QueueScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler { - private BlockingQueue queue = new LinkedBlockingQueue(); + private final BlockingQueue queue; + + public QueueScheduler() { + this.queue = new LinkedBlockingQueue<>(); + } + + /** + * Creates a {@code QueueScheduler} with the given (fixed) capacity. + * + * @param capacity the capacity of this queue, + * see {@link LinkedBlockingQueue#LinkedBlockingQueue(int)} + * @since 0.8.0 + */ + public QueueScheduler(int capacity) { + this.queue = new LinkedBlockingQueue<>(capacity); + } @Override public void pushWhenNoDuplicate(Request request, Task task) { - queue.add(request); + logger.trace("Remaining capacity: {}", this.queue.remainingCapacity()); + + try { + queue.put(request); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } } @Override diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index 289d2759f..e2c0f741c 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic-parent - 0.7.6 + 0.8.0 webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index fc5d9b761..05d6100a6 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.7.6 + 0.8.0 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index c726c07b1..449fcf243 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.6 + 0.8.0 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 893fc0b7b..b73f6fd27 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.6 + 0.8.0 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 0022a43b3..3ec15f9af 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -3,7 +3,7 @@ 
webmagic-parent us.codecraft - 0.7.6 + 0.8.0 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 36ded0005..715d7731b 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.6 + 0.8.0 4.0.0