From e5be686b06ddeda6a5d0fd3d18521930ae6a5674 Mon Sep 17 00:00:00 2001 From: TobiGr Date: Mon, 2 Jan 2023 18:59:03 +0100 Subject: [PATCH] Try to fix detecting replies to a comment on the previous page When getting a page which is not the initial page, it is possible that the first comments are replies to a comment from a previous page. --- .../org/schabi/newpipe/extractor/Page.java | 2 +- .../SoundcloudCommentsExtractor.java | 103 +++++++++++++++--- .../SoundcloudCommentsInfoItemExtractor.java | 54 ++++----- 3 files changed, 115 insertions(+), 44 deletions(-) diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/Page.java b/extractor/src/main/java/org/schabi/newpipe/extractor/Page.java index e13a922878..091c5e7675 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/Page.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/Page.java @@ -13,7 +13,7 @@ *
* A page has an {@link #id}, an {@link #url}, as well as information on possible {@link #cookies}. * In case the data behind the URL has already been retrieved, - * it can be accessed by using @link #getBody()} and {@link #getContent()}. + * it can be accessed by using {@link #getBody()} or {@link #getContent()}. */ public class Page implements Serializable { private final String url; diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/soundcloud/extractors/SoundcloudCommentsExtractor.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/soundcloud/extractors/SoundcloudCommentsExtractor.java index f253cb695b..56af6b43eb 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/soundcloud/extractors/SoundcloudCommentsExtractor.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/soundcloud/extractors/SoundcloudCommentsExtractor.java @@ -21,13 +21,24 @@ import org.schabi.newpipe.extractor.services.soundcloud.SoundcloudParsingHelper; import java.io.IOException; +import java.util.ArrayList; +import java.util.List; import javax.annotation.Nonnull; +import javax.annotation.Nullable; public class SoundcloudCommentsExtractor extends CommentsExtractor { public static final String COLLECTION = "collection"; public static final String NEXT_HREF = "next_href"; + /** + * The last comment which was a top level comment. + * Next pages might start with replies to the last top level comment + * and therefore the {@link SoundcloudCommentsInfoItemExtractor#replyCount} + * of the last top level comment cannot be determined certainly. 
+ */ + @Nullable private JsonObject lastTopLevelComment; + public SoundcloudCommentsExtractor(final StreamingService service, final ListLinkHandler uiHandler) { super(service, uiHandler); @@ -50,14 +61,15 @@ public InfoItemsPage getInitialPage() throws ExtractionExcepti final CommentsInfoItemsCollector collector = new CommentsInfoItemsCollector( getServiceId()); - collectCommentsFrom(collector, json); + collectCommentsFrom(collector, json, null); return new InfoItemsPage<>(collector, new Page(json.getString(NEXT_HREF))); } @Override - public InfoItemsPage getPage(final Page page) throws ExtractionException, - IOException { + public InfoItemsPage getPage(final Page page) + throws ExtractionException, IOException { + if (page == null || isNullOrEmpty(page.getUrl())) { throw new IllegalArgumentException("Page doesn't contain an URL"); } @@ -88,7 +100,7 @@ public InfoItemsPage getPage(final Page page) throws Extractio } catch (final JsonParserException e) { throw new ParsingException("Could not parse json", e); } - collectCommentsFrom(collector, json); + collectCommentsFrom(collector, json, lastTopLevelComment); } if (hasNextPage) { @@ -101,27 +113,86 @@ public InfoItemsPage getPage(final Page page) throws Extractio @Override public void onFetchPage(@Nonnull final Downloader downloader) { } - private void collectCommentsFrom(final CommentsInfoItemsCollector collector, - final JsonObject json) throws ParsingException { + /** + * Collect top level comments from a SoundCloud API response. + * @param collector the collector which collects the top level comments + * @param json the JsonObject of the API response + * @param lastTopLevelComment the last top level comment from the previous page or {@code null} + * if this method is run for the initial page. 
+ * @throws ParsingException + */ + private void collectCommentsFrom(@Nonnull final CommentsInfoItemsCollector collector, + @Nonnull final JsonObject json, + @Nullable final JsonObject lastTopLevelComment) + throws ParsingException { + final List extractors = new ArrayList<>(); final String url = getUrl(); final JsonArray entries = json.getArray(COLLECTION); - JsonObject lastTopComment = null; + /** + * The current top level comment. + */ + JsonObject currentTopLevelComment = null; + boolean isLastCommentReply = true; + // Check whether the first comment in the list is a reply to the last top level comment + // from the previous page if there was a previous page. + if (lastTopLevelComment != null) { + final JsonObject firstComment = entries.getObject(0); + if (SoundcloudParsingHelper.isReplyTo(lastTopLevelComment, firstComment)) { + currentTopLevelComment = lastTopLevelComment; + } else { + extractors.add(new SoundcloudCommentsInfoItemExtractor( + json, SoundcloudCommentsInfoItemExtractor.PREVIOUS_PAGE_INDEX, + firstComment, url, null)); + } + } + for (int i = 0; i < entries.size(); i++) { final JsonObject entry = entries.getObject(i); - if (i == 0 + // extract all top level comments + // The first comment is either a top level comment + // if it is not a reply to the last top level comment + // + if (i == 0 && currentTopLevelComment == null || (!SoundcloudParsingHelper.isReplyTo(entries.getObject(i - 1), entry) - && !SoundcloudParsingHelper.isReplyTo(lastTopComment, entry))) { - lastTopComment = entry; - collector.commit(new SoundcloudCommentsInfoItemExtractor( - json, i, entry, url)); + && !SoundcloudParsingHelper.isReplyTo(currentTopLevelComment, entry))) { + currentTopLevelComment = entry; + if (i == entries.size() - 1) { + isLastCommentReply = false; + this.lastTopLevelComment = currentTopLevelComment; + // Do not collect the last comment if it is a top level comment + // because it might have replies. 
+ // That is information we cannot get from the comment itself + // (thanks SoundCloud...) but needs to be obtained from the next comment. + // The comment will therefore be collected + // when collecting the items from the next page. + break; + } + extractors.add(new SoundcloudCommentsInfoItemExtractor( + json, i, entry, url, lastTopLevelComment)); } } + if (isLastCommentReply) { + // Do not collect the last top level comment if it has replies and the retrieved + // comment list ends with a reply. We do not know whether the next page starts + // with more replies to the last top level comment. + this.lastTopLevelComment = extractors.remove(extractors.size() - 1).item; + } + extractors.stream().forEach(collector::commit); + } - private boolean collectRepliesFrom(final CommentsInfoItemsCollector collector, - final JsonObject json, - final int id, - final String url) { + /** + * Collect replies to a top level comment from a SoundCloud API response. + * @param collector the collector which collects the replies + * @param json the SoundCloud API response + * @param id the comment's id for which the replies are collected + * @param url the corresponding page's URL + * @return + */ + private boolean collectRepliesFrom(@Nonnull final CommentsInfoItemsCollector collector, + @Nonnull final JsonObject json, + final int id, + @Nonnull final String url) { JsonObject originalComment = null; final JsonArray entries = json.getArray(COLLECTION); boolean moreReplies = false; diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/soundcloud/extractors/SoundcloudCommentsInfoItemExtractor.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/soundcloud/extractors/SoundcloudCommentsInfoItemExtractor.java index db9ef549dd..0dfa3edeee 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/soundcloud/extractors/SoundcloudCommentsInfoItemExtractor.java +++ 
b/extractor/src/main/java/org/schabi/newpipe/extractor/services/soundcloud/extractors/SoundcloudCommentsInfoItemExtractor.java @@ -6,10 +6,8 @@ import com.grack.nanojson.JsonObject; import org.schabi.newpipe.extractor.Page; -import org.schabi.newpipe.extractor.ServiceList; import org.schabi.newpipe.extractor.comments.CommentsInfoItem; import org.schabi.newpipe.extractor.comments.CommentsInfoItemExtractor; -import org.schabi.newpipe.extractor.comments.CommentsInfoItemsCollector; import org.schabi.newpipe.extractor.exceptions.ParsingException; import org.schabi.newpipe.extractor.localization.DateWrapper; import org.schabi.newpipe.extractor.services.soundcloud.SoundcloudParsingHelper; @@ -17,32 +15,42 @@ import java.util.Objects; +import javax.annotation.Nonnull; import javax.annotation.Nullable; public class SoundcloudCommentsInfoItemExtractor implements CommentsInfoItemExtractor { + public static final int PREVIOUS_PAGE_INDEX = -1; public static final String BODY = "body"; public static final String USER_PERMALINK = "permalink"; public static final String USER_FULL_NAME = "full_name"; public static final String USER_USERNAME = "username"; - private final JsonObject json; + @Nonnull private final JsonObject json; private final int index; - private final JsonObject item; + @Nonnull public final JsonObject item; private final String url; - private final JsonObject user; - private final JsonObject superComment; - + @Nonnull private final JsonObject user; + /** + * A comment to which this comment is a reply. + * Is {@code null} if this comment is itself a top level comment. + */ + @Nullable private final JsonObject topLevelComment; + + /** + * The reply count is not given by the SoundCloud API, but needs to be obtained + * by counting the comments which come directly after this item and have the same timestamp. 
+ */ private int replyCount = CommentsInfoItem.UNKNOWN_REPLY_COUNT; private Page repliesPage = null; - public SoundcloudCommentsInfoItemExtractor(final JsonObject json, final int index, - final JsonObject item, final String url, - @Nullable final JsonObject superComment) { + public SoundcloudCommentsInfoItemExtractor(@Nonnull final JsonObject json, final int index, + @Nonnull final JsonObject item, final String url, + @Nullable final JsonObject topLevelComment) { this.json = json; this.index = index; this.item = item; this.url = url; - this.superComment = superComment; + this.topLevelComment = topLevelComment; this.user = item.getObject("user"); } @@ -58,7 +66,7 @@ public String getCommentId() { @Override public Description getCommentText() { String commentContent = item.getString(BODY); - if (superComment == null) { + if (topLevelComment == null) { return new Description(commentContent, Description.PLAIN_TEXT); } // This comment is a reply to another comment. @@ -78,7 +86,7 @@ public Description getCommentText() { } } if (author == null) { - author = superComment.getObject("user"); + author = topLevelComment.getObject("user"); } final String name = isNullOrEmpty(author.getString(USER_FULL_NAME)) ? author.getString(USER_USERNAME) : author.getString(USER_FULL_NAME); @@ -149,24 +157,17 @@ public String getThumbnailUrl() { @Override public Page getReplies() { if (replyCount == CommentsInfoItem.UNKNOWN_REPLY_COUNT) { - final JsonArray replies = new JsonArray(); - final CommentsInfoItemsCollector collector = new CommentsInfoItemsCollector( - ServiceList.SoundCloud.getServiceId()); + replyCount = 0; // SoundCloud has only comments and top level replies, but not nested replies. // Therefore, replies cannot have further replies. - if (superComment == null) { + if (topLevelComment == null) { // Loop through all comments which come after the original comment // to find its replies. 
final JsonArray allItems = json.getArray(SoundcloudCommentsExtractor.COLLECTION); - boolean foundReply = false; for (int i = index + 1; i < allItems.size(); i++) { - final JsonObject comment = allItems.getObject(i); - if (SoundcloudParsingHelper.isReplyTo(item, comment)) { - replies.add(comment); - collector.commit(new SoundcloudCommentsInfoItemExtractor( - json, i, comment, url, item)); - foundReply = true; - } else if (foundReply) { + if (SoundcloudParsingHelper.isReplyTo(item, allItems.getObject(i))) { + replyCount++; + } else { // Only the comments directly after the original comment // having the same timestamp are replies to the original comment. // The first comment not having the same timestamp @@ -175,8 +176,7 @@ public Page getReplies() { } } } - replyCount = replies.size(); - if (collector.getItems().isEmpty()) { + if (replyCount == 0) { return null; } repliesPage = new Page(getUrl(), getCommentId());