From 395b3c4b9af64fed1cccc0db107fda958003f317 Mon Sep 17 00:00:00 2001 From: Bagas Wastu <67826350+bgwastu@users.noreply.github.com> Date: Sat, 13 Jan 2024 12:47:39 +0700 Subject: [PATCH 1/4] feat: add url cleaner feature --- .../BotUpdate/Message/MessageUpdateHandler.cs | 54 +++++- BotNet.Services/CleanUrl/Rule.cs | 160 ++++++++++++++++++ BotNet.Services/CleanUrl/UrlCleaner.cs | 26 +++ .../Services/CleanUrl/UrlCleanerTests.cs | 19 +++ 4 files changed, 251 insertions(+), 8 deletions(-) create mode 100644 BotNet.Services/CleanUrl/Rule.cs create mode 100644 BotNet.Services/CleanUrl/UrlCleaner.cs create mode 100644 BotNet.Tests/Services/CleanUrl/UrlCleanerTests.cs diff --git a/BotNet.CommandHandlers/BotUpdate/Message/MessageUpdateHandler.cs b/BotNet.CommandHandlers/BotUpdate/Message/MessageUpdateHandler.cs index 20b5745..86d0913 100644 --- a/BotNet.CommandHandlers/BotUpdate/Message/MessageUpdateHandler.cs +++ b/BotNet.CommandHandlers/BotUpdate/Message/MessageUpdateHandler.cs @@ -1,8 +1,10 @@ -using BotNet.Commands; +using System.Text.RegularExpressions; +using BotNet.Commands; using BotNet.Commands.BotUpdate.Message; using BotNet.Commands.CommandPrioritization; using BotNet.Services.BotProfile; using BotNet.Services.SocialLink; +using BotNet.Services.UrlCleaner; using RG.Ninja; using Telegram.Bot; using Telegram.Bot.Types.Enums; @@ -42,16 +44,17 @@ await _commandQueue.DispatchAsync( return; } - // Handle Social Link (better preview) if ((update.Message.Text ?? 
update.Message.Caption) is { } textOrCaption) { - IEnumerable possibleUrls = SocialLinkEmbedFixer.GetPossibleUrls(textOrCaption); - if (possibleUrls.Any()) { + // Handle Social Link (better preview) + IEnumerable possibleSocialUrls = SocialLinkEmbedFixer.GetPossibleUrls(textOrCaption); + if (possibleSocialUrls.Any()) { // Fire and forget Task _ = Task.Run(async () => { try { - foreach (Uri url in possibleUrls) { - Uri fixedUrl = SocialLinkEmbedFixer.Fix(url); + foreach (Uri url in possibleSocialUrls) { + Uri cleanedUrl = UrlCleaner.Clean(url); + Uri fixedUrl = SocialLinkEmbedFixer.Fix(cleanedUrl); await _telegramBotClient.SendTextMessageAsync( chatId: update.Message.Chat.Id, text: $"Preview: {fixedUrl.OriginalString}", @@ -65,6 +68,39 @@ await _telegramBotClient.SendTextMessageAsync( }); return; } + + // get list of urls from message (start with http or https or www) + string pattern = @"(?i)\b((?:https?://|www\.)\S+)\b"; + MatchCollection matches = Regex.Matches(textOrCaption, pattern); + List urls = matches.Select(m => m.Value).ToList(); + + // Clean the url + if (urls.Count > 0) { + // Fire and forget + Task _ = Task.Run(async () => { + try { + foreach (string url in urls) { + Uri cleanedUrl = UrlCleaner.Clean(new Uri(url)); + + // if the url is same, don't send the message + if (cleanedUrl.OriginalString == new Uri(url).OriginalString) { + continue; + } + + await _telegramBotClient.SendTextMessageAsync( + chatId: update.Message.Chat.Id, + text: $"Cleaned URL: {cleanedUrl.OriginalString}", + replyToMessageId: update.Message.MessageId, + cancellationToken: cancellationToken + ); + } + } catch (OperationCanceledException) { + // Terminate gracefully + } + }); + return; + } + } // Handle reddit mirroring @@ -74,7 +110,8 @@ await _telegramBotClient.SendTextMessageAsync( Offset: var offset, Length: var length } && update.Message.Text?.Substring(offset, length) is { } url - && url.StartsWith("https://www.reddit.com/", out string? 
remainingUrl)) { + && UrlCleaner.Clean(new Uri(url)) is { } cleanedUrl + && cleanedUrl.ToString().StartsWith("https://www.reddit.com/", out string? remainingUrl)) { // Fire and forget Task _ = Task.Run(async () => { try { @@ -93,7 +130,8 @@ await _telegramBotClient.SendTextMessageAsync( } else if (update.Message?.Entities?.FirstOrDefault(entity => entity is { Type: MessageEntityType.TextLink }) is { Url: { } textUrl } - && textUrl.StartsWith("https://www.reddit.com/", out string? remainingTextUrl)) { + && UrlCleaner.Clean(new Uri(textUrl)) is { } cleanedTextUrl + && cleanedTextUrl.ToString().StartsWith("https://www.reddit.com/", out string? remainingTextUrl)) { // Fire and forget Task _ = Task.Run(async () => { try { diff --git a/BotNet.Services/CleanUrl/Rule.cs b/BotNet.Services/CleanUrl/Rule.cs new file mode 100644 index 0000000..0a7bd13 --- /dev/null +++ b/BotNet.Services/CleanUrl/Rule.cs @@ -0,0 +1,160 @@ +using System.Collections.Generic; +using System.Text.RegularExpressions; + +namespace BotNet.Services.UrlCleaner { + public class Rule { + public required string Name { get; set; } + public required Regex Match { get; set; } + public required List Rules { get; set; } + public List? Replace { get; set; } + + } + + public static class RuleData { + /// + /// Represents a list of rules for cleaning URLs. 
+ /// + public static List Rules = [ + new Rule + { + Name = "Global", + Match = new Regex("./*"), + Rules = + [ + // https://en.wikipedia.org/wiki/UTM_parameters + "utm_source", "utm_medium", "utm_term", "utm_campaign", + "utm_content", "utm_name", "utm_cid", "utm_reader", "utm_viz_id", + "utm_pubreferrer", "utm_swu", "utm_social-type", "utm_brand", + "utm_team", "utm_feeditemid", "utm_id", "utm_marketing_tactic", + "utm_creative_format", "utm_campaign_id", "utm_source_platform", + "utm_timestamp", "utm_souce", + // ITM parameters, a variant of UTM parameters + "itm_source", "itm_medium", "itm_term", "itm_campaign", "itm_content", + "itm_channel", "itm_source_s", "itm_medium_s", "itm_campaign_s", + "itm_audience", + // INT parameters, another variant of UTM + "int_source", "int_cmp_name", "int_cmp_id", "int_cmp_creative", + "int_medium", "int_campaign", + // piwik + "pk_campaign", "pk_cpn", "pk_source", "pk_medium", + "pk_keyword", "pk_kwd", "pk_content", "pk_cid", + "piwik_campaign", "piwik_cpn", "piwik_source", "piwik_medium", + "piwik_keyword", "piwik_kwd", "piwik_content", "piwik_cid", + // Google Ads + "gclid", "ga_source", "ga_medium", "ga_term", "ga_content", "ga_campaign", + "ga_place", "gclid", "gclsrc", + // hsa_* parameters + "hsa_cam", "hsa_grp", "hsa_mt", "hsa_src", "hsa_ad", "hsa_acc", + "hsa_net", "hsa_kw", "hsa_tgt", "hsa_ver", "hsa_la", "hsa_ol", + // Facebook + "fbclid", + // Olytics + "oly_enc_id", "oly_anon_id", + // Vero + "vero_id", "vero_conv", + // Drip + "__s", + // HubSpot + "_hsenc", "_hsmi", "__hssc", "__hstc", "__hsfp", "hsCtaTracking", + // Marketo + "mkt_tok", + // Matomo + "mtm_campaign", "mtm_keyword", "mtm_kwd", "mtm_source", "mtm_medium", + "mtm_content", "mtm_cid", "mtm_group", "mtm_placement", + // Oracle Eloqua + "elqTrackId", "elq", "elqaid", "elqat", "elqCampaignId", "elqTrack", + // MailChimp + "mc_cid", "mc_eid", + // Other + "ncid", "cmpid", "mbid", + // Reddit Ads + "rdt_cid" + ] + }, + new Rule + { + Name = "audible.com", + Match = 
new Regex("www.audible.com", RegexOptions.IgnoreCase), + Rules = ["qid", "sr", "pf_rd_p", "pf_rd_r", "plink", "ref"] + }, + new Rule + { + Name = "reddit.com", + Match = new Regex(@".*\.reddit\.com", RegexOptions.IgnoreCase), + Rules = + [ + "ref_campaign", "ref_source", "tags", "keyword", "channel", "campaign", + "user_agent", "domain", "base_url", "$android_deeplink_path", + "$deeplink_path", "$og_redirect", "share_id", "correlation_id", "$deep_link", "post_index", "ref", "_branch_match_id", "post_fullname", "$3p", "_branch_referrer" + ] + }, + new Rule + { + Name = "facebook.com", + Match = new Regex(@".*\.facebook\.com", RegexOptions.IgnoreCase), + Rules = + [ + "fbclid", "fb_ref", "fb_source", "referral_code", "referral_story_type", "tracking", "ref", "mibextid", "app" + ], + }, + new Rule + { + Name = "shopee.com", + Match = new Regex(@"^(?:https?:\/\/)?(?:[^.]+\.)?shopee\.[a-z0-9]{0,3}", RegexOptions.IgnoreCase), + Rules = + [ + "af_siteid", "pid", "af_click_lookback", "af_viewthrough_lookback", + "is_retargeting", "af_reengagement_window", "af_sub_siteid", "c" + ] + }, + new Rule + { + Name = "instagram.com", + Match = new Regex(@"^(?:https?:\/\/)?(?:[^.]+\.)?instagram\.com", RegexOptions.IgnoreCase), + Rules = ["igshid", "source"], + }, + new Rule + { + Name = "twitter.com or x.com", + Match = new Regex("(twitter.com|x.com)", RegexOptions.IgnoreCase), + Rules = ["s", "src", "ref_url", "ref_src"] + }, + new Rule + { + Name = "youtube.com", + Match = new Regex(@".*\.youtube\.com", RegexOptions.IgnoreCase), + Rules = ["gclid", "feature", "app", "src", "lId", "cId", "embeds_referring_euri"], + }, + new Rule + { + Name = "discord.com", + Match = new Regex(@".*\.discord\.com", RegexOptions.IgnoreCase), + Rules = ["source"] + }, + new Rule + { + Name = "medium.com", + Match = new Regex(@"medium\.com", RegexOptions.IgnoreCase), + Rules = ["source"] + }, +new Rule + { + Name = "apple.com", + Match = new Regex(@".*\.apple\.com", RegexOptions.IgnoreCase), + Rules = 
["uo", "app", "at", "ct", "ls", "pt", "mt", "itsct", "itscg", "referrer", "src", "cid"] + }, + new Rule + { + Name = "music.apple.com", + Match = new Regex(@"music\.apple\.com", RegexOptions.IgnoreCase), + Rules = ["i", "lId", "cId", "sr", "src"] + }, + new Rule + { + Name = "play.google.com", + Match = new Regex(@"play\.google\.com", RegexOptions.IgnoreCase), + Rules = ["referrer", "pcampaignid"] + }, + ]; + } +} \ No newline at end of file diff --git a/BotNet.Services/CleanUrl/UrlCleaner.cs b/BotNet.Services/CleanUrl/UrlCleaner.cs new file mode 100644 index 0000000..83741bb --- /dev/null +++ b/BotNet.Services/CleanUrl/UrlCleaner.cs @@ -0,0 +1,26 @@ +using System; +using System.Text.RegularExpressions; + +namespace BotNet.Services.UrlCleaner { + public partial class UrlCleaner { + /// + /// Cleans the specified URL by removing query parameters based on predefined rules. + /// + /// The URL to be cleaned. + /// A cleaned URI. + public static Uri Clean(Uri url) { + foreach (Rule rule in RuleData.Rules) { + if (rule.Match.IsMatch(url.ToString())) { + foreach (string r in rule.Rules) { + url = new Uri(Regex.Replace(url.ToString(), $"[&?]({r})=[^&]*", "")); + } + } + } + + // Remove trailing '?' 
or '&' if present + string cleanedUrl = url.ToString().TrimEnd('?', '&').TrimEnd('/'); + + return new Uri(cleanedUrl); + } + } +} \ No newline at end of file diff --git a/BotNet.Tests/Services/CleanUrl/UrlCleanerTests.cs b/BotNet.Tests/Services/CleanUrl/UrlCleanerTests.cs new file mode 100644 index 0000000..d50dd04 --- /dev/null +++ b/BotNet.Tests/Services/CleanUrl/UrlCleanerTests.cs @@ -0,0 +1,19 @@ +using Xunit; +using BotNet.Services.UrlCleaner; + +namespace BotNet.Tests.Services.CleanUrl { + public class UrlCleanerTests { + + [Theory] + [InlineData("https://nasional.kompas.com/read/2024/01/10/17560541/jokowi-belum-ucapkan-selamat-ultah-ke-pdi-p-ganjar-lupa-kali?utm_source=Telegram&utm_medium=Referral&utm_campaign=Top_Desktop", "https://nasional.kompas.com/read/2024/01/10/17560541/jokowi-belum-ucapkan-selamat-ultah-ke-pdi-p-ganjar-lupa-kali")] + [InlineData("https://www.reddit.com/r/indonesia/comments/10nc28j/kerugian_udah_tembus_7m_gara2_bug_promo/?$deep_link=true&correlation_id=b1d34957-35e3-4ce1-9120-eb111509ae81&post_fullname=t3_10nc28j&post_index=1&ref=email_digest&ref_campaign=email_digest&ref_source=email&utm_content=post_title&utm_medium=Email%20Amazon%20SES&$3p=e_as&_branch_match_id=696254937267305114&_branch_referrer=H4sIAAAAAAAAA22QXWrDMBCET%2BO%2B2Yksp0kKoRRKr7GspY2jRH9IK9Ljd920fSpIMHyj3Rl0Yc71ZbMpZK3jAXMevIu3jc6v3TjpfCLA%2BiQyFbe4iB5a8afLOtXpt278kHO%2F34efeZOCgCLXRZsiVYeihQaKXEWqbTTj4SrqRqUtDiM0ixdgCnOrsA%2BwYMER5rZALimkNUJLymSJMqzdOv3OpVE3PptUCnlklyI4K3xWVk%2FH3b7XO9L9ZEj1RzVue5qVUrvtEemgZC6nynBu3kcMtK7T8NfrYUp7%2BhRnfV3oLIoCOg%2FWLVT5AcFgyOiW%2BL9bUyuGfj2BjQOYFFl%2BQuh3DDv29AW7S%2FV8gwEAAA%3D%3D", "https://www.reddit.com/r/indonesia/comments/10nc28j/kerugian_udah_tembus_7m_gara2_bug_promo/?$deep_link=true&$3p=e_as")] + [InlineData("https://www.kaorinusantara.or.id/newsline/194064/kak-seto-beneran-jadi-seto-kaiba-di-google?fbclid=IwAR2TTZgHLAAYJtZj_L5MKRGrHzrCa04_y8SMwYG-cteuyL6A5u1VVDjqh_c", 
"https://www.kaorinusantara.or.id/newsline/194064/kak-seto-beneran-jadi-seto-kaiba-di-google")] + [InlineData("https://www.facebook.com/groups/informatika.cringeposting/permalink/1110311033679168/?ref=share&mibextid=Cw5JYn", "https://www.facebook.com/groups/informatika.cringeposting/permalink/1110311033679168")] + [InlineData("https://www.instagram.com/reel/CvOeEfJhG0f/?igshid=NTc4MTIwNjQ2YQ%3D%3D", "https://www.instagram.com/reel/CvOeEfJhG0f")] + [InlineData("https://twitter.com/petergyang/status/1573489316147306496?ref_src=twsrc%5Etfw%7Ctwcamp%5Etweetembed%7Ctwterm%5E1573489316147306496%7Ctwgr%5E9bfbec9d831b2a896ffc769afc3b65024c52850b%7Ctwcon%5Es1_&ref_url=https%3A%2F%2Fgames.ensipedia.id%2Fnews%2Fcerdas-mahasiswa-ini-manfaatkan-ai-untuk-mengerjakan-tugas-kuliah-dan-dapat-nilai-a%2F", "https://twitter.com/petergyang/status/1573489316147306496")] + public void CleanUrl_ShouldRemoveQueryParametersBasedOnRules(string url, string result) { + string cleanedUrl = UrlCleaner.Clean(new System.Uri(url)).ToString(); + Assert.Equal(result, cleanedUrl); + } + } +} \ No newline at end of file From 3e31148fcd6a9615637695fda94360669ab054da Mon Sep 17 00:00:00 2001 From: Bagas Wastu <67826350+bgwastu@users.noreply.github.com> Date: Sat, 13 Jan 2024 12:50:08 +0700 Subject: [PATCH 2/4] fix: bing support --- BotNet.Services/CleanUrl/Rule.cs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/BotNet.Services/CleanUrl/Rule.cs b/BotNet.Services/CleanUrl/Rule.cs index 0a7bd13..188dd0d 100644 --- a/BotNet.Services/CleanUrl/Rule.cs +++ b/BotNet.Services/CleanUrl/Rule.cs @@ -155,6 +155,16 @@ public static class RuleData { Match = new Regex(@"play\.google\.com", RegexOptions.IgnoreCase), Rules = ["referrer", "pcampaignid"] }, + new Rule + { + Name = "bing.com", + Match = new Regex(@"^www\.bing\.com", RegexOptions.IgnoreCase), + Rules = [ + "qs", "form", "sp", "pq", "sc", "sk", "cvid", "FORM", + "ck", "simid", "thid", "cdnurl", "pivotparams", "ghsh", "ghacc", + "ccid", "", "ru" + 
] + } ]; } } \ No newline at end of file From b4dc2ea27bbac3e9f6612854fae50c8378180b15 Mon Sep 17 00:00:00 2001 From: Bagas Wastu <67826350+bgwastu@users.noreply.github.com> Date: Sat, 13 Jan 2024 13:10:10 +0700 Subject: [PATCH 3/4] fix: match tracker params without payload --- BotNet.Services/CleanUrl/UrlCleaner.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BotNet.Services/CleanUrl/UrlCleaner.cs b/BotNet.Services/CleanUrl/UrlCleaner.cs index 83741bb..09a1017 100644 --- a/BotNet.Services/CleanUrl/UrlCleaner.cs +++ b/BotNet.Services/CleanUrl/UrlCleaner.cs @@ -12,7 +12,7 @@ public static Uri Clean(Uri url) { foreach (Rule rule in RuleData.Rules) { if (rule.Match.IsMatch(url.ToString())) { foreach (string r in rule.Rules) { - url = new Uri(Regex.Replace(url.ToString(), $"[&?]({r})=[^&]*", "")); + url = new Uri(Regex.Replace(url.ToString(), $"[&?]({r})=?[^&]*", "")); } } } From 0fd704b6301c1deb7059d195254ee2eddb38cfa5 Mon Sep 17 00:00:00 2001 From: Bagas Wastu <67826350+bgwastu@users.noreply.github.com> Date: Sat, 13 Jan 2024 13:10:48 +0700 Subject: [PATCH 4/4] fix: improve fb tracker detection --- BotNet.Services/CleanUrl/Rule.cs | 2 +- BotNet.Tests/Services/CleanUrl/UrlCleanerTests.cs | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/BotNet.Services/CleanUrl/Rule.cs b/BotNet.Services/CleanUrl/Rule.cs index 188dd0d..89a3272 100644 --- a/BotNet.Services/CleanUrl/Rule.cs +++ b/BotNet.Services/CleanUrl/Rule.cs @@ -94,7 +94,7 @@ public static class RuleData { Match = new Regex(@".*\.facebook\.com", RegexOptions.IgnoreCase), Rules = [ - "fbclid", "fb_ref", "fb_source", "referral_code", "referral_story_type", "tracking", "ref", "mibextid", "app" + "fbclid", "fb_ref", "fb_source", "referral_code", "referral_story_type", "tracking", "ref", "mibextid", "app", "_rdr", "m_entstream_source", "paipv", "locale", "eav" ], }, new Rule diff --git a/BotNet.Tests/Services/CleanUrl/UrlCleanerTests.cs 
b/BotNet.Tests/Services/CleanUrl/UrlCleanerTests.cs index d50dd04..e6ab120 100644 --- a/BotNet.Tests/Services/CleanUrl/UrlCleanerTests.cs +++ b/BotNet.Tests/Services/CleanUrl/UrlCleanerTests.cs @@ -10,6 +10,7 @@ public class UrlCleanerTests { [InlineData("https://www.kaorinusantara.or.id/newsline/194064/kak-seto-beneran-jadi-seto-kaiba-di-google?fbclid=IwAR2TTZgHLAAYJtZj_L5MKRGrHzrCa04_y8SMwYG-cteuyL6A5u1VVDjqh_c", "https://www.kaorinusantara.or.id/newsline/194064/kak-seto-beneran-jadi-seto-kaiba-di-google")] [InlineData("https://www.facebook.com/groups/informatika.cringeposting/permalink/1110311033679168/?ref=share&mibextid=Cw5JYn", "https://www.facebook.com/groups/informatika.cringeposting/permalink/1110311033679168")] [InlineData("https://www.instagram.com/reel/CvOeEfJhG0f/?igshid=NTc4MTIwNjQ2YQ%3D%3D", "https://www.instagram.com/reel/CvOeEfJhG0f")] + [InlineData("https://www.facebook.com/story.php?story_fbid=410430604519673&id=109479077948162&m_entstream_source=permalink&locale=ms_MY&paipv=0&eav=AfYTS-fXj_ioV3KMOuFjUx1hAV_g_FQX1W_Jfxi1SowzDV9LarINjBexw46zqBxKCTo&_rdr", "https://www.facebook.com/story.php?story_fbid=410430604519673&id=109479077948162")] [InlineData("https://twitter.com/petergyang/status/1573489316147306496?ref_src=twsrc%5Etfw%7Ctwcamp%5Etweetembed%7Ctwterm%5E1573489316147306496%7Ctwgr%5E9bfbec9d831b2a896ffc769afc3b65024c52850b%7Ctwcon%5Es1_&ref_url=https%3A%2F%2Fgames.ensipedia.id%2Fnews%2Fcerdas-mahasiswa-ini-manfaatkan-ai-untuk-mengerjakan-tugas-kuliah-dan-dapat-nilai-a%2F", "https://twitter.com/petergyang/status/1573489316147306496")] public void CleanUrl_ShouldRemoveQueryParametersBasedOnRules(string url, string result) { string cleanedUrl = UrlCleaner.Clean(new System.Uri(url)).ToString();