From 9999d5ee238c653fa0bb2d3455eb8b52eda3a9c8 Mon Sep 17 00:00:00 2001 From: Oguzhan Unlu Date: Mon, 8 Apr 2024 16:11:16 +0300 Subject: [PATCH] Remove too long atomic fields (close #889) Co-Authored-By: Benjamin BENOIST --- .../common/enrichments/AtomicFields.scala | 171 +++++++++--------- .../AtomicFieldsLengthValidator.scala | 13 +- .../enrichments/EnrichmentManager.scala | 8 +- .../enrichments/EnrichmentManagerSpec.scala | 21 +++ 4 files changed, 120 insertions(+), 93 deletions(-) diff --git a/modules/common/src/main/scala/com.snowplowanalytics.snowplow.enrich/common/enrichments/AtomicFields.scala b/modules/common/src/main/scala/com.snowplowanalytics.snowplow.enrich/common/enrichments/AtomicFields.scala index 2ddbc83b4..0bf386417 100644 --- a/modules/common/src/main/scala/com.snowplowanalytics.snowplow.enrich/common/enrichments/AtomicFields.scala +++ b/modules/common/src/main/scala/com.snowplowanalytics.snowplow.enrich/common/enrichments/AtomicFields.scala @@ -32,97 +32,98 @@ object AtomicFields { final case class AtomicField( name: String, - enrichedValueExtractor: EnrichedEvent => String + enrichedValueExtractor: EnrichedEvent => String, + nullify: EnrichedEvent => Unit ) final case class LimitedAtomicField(value: AtomicField, limit: Int) // format: off val supportedFields: List[AtomicField] = List( - AtomicField(name = "app_id", _.app_id ), - AtomicField(name = "platform", _.platform ), - AtomicField(name = "event", _.event ), - AtomicField(name = "event_id", _.event ), - AtomicField(name = "name_tracker", _.name_tracker ), - AtomicField(name = "v_tracker", _.v_tracker ), - AtomicField(name = "v_collector", _.v_collector ), - AtomicField(name = "v_etl", _.v_etl ), - AtomicField(name = "user_id", _.user_id ), - AtomicField(name = "user_ipaddress", _.user_ipaddress ), - AtomicField(name = "user_fingerprint", _.user_fingerprint ), - AtomicField(name = "domain_userid", _.domain_userid ), - AtomicField(name = "network_userid", _.network_userid ), - AtomicField(name = "geo_country", _.geo_country ), - AtomicField(name = "geo_region", _.geo_region ), - AtomicField(name = "geo_city", _.geo_city ), - AtomicField(name = "geo_zipcode", _.geo_zipcode ), - AtomicField(name = "geo_region_name", _.geo_region_name ), - AtomicField(name = "ip_isp", _.ip_isp ), - AtomicField(name = "ip_organization", _.ip_organization ), - AtomicField(name = "ip_domain", _.ip_domain ), - AtomicField(name = "ip_netspeed", _.ip_netspeed ), - AtomicField(name = "page_url", _.page_url ), - AtomicField(name = "page_title", _.page_title ), - AtomicField(name = "page_referrer", _.page_referrer ), - AtomicField(name = "page_urlscheme", _.page_urlscheme ), - AtomicField(name = "page_urlhost", _.page_urlhost ), - AtomicField(name = "page_urlpath", _.page_urlpath ), - AtomicField(name = "page_urlquery", _.page_urlquery ), - AtomicField(name = "page_urlfragment", _.page_urlfragment ), - AtomicField(name = "refr_urlscheme", _.refr_urlscheme ), - AtomicField(name = "refr_urlhost", _.refr_urlhost ), - AtomicField(name = "refr_urlpath", _.refr_urlpath ), - AtomicField(name = "refr_urlquery", _.refr_urlquery ), - AtomicField(name = "refr_urlfragment", _.refr_urlfragment ), - AtomicField(name = "refr_medium", _.refr_medium ), - AtomicField(name = "refr_source", _.refr_source ), - AtomicField(name = "refr_term", _.refr_term ), - AtomicField(name = "mkt_medium", _.mkt_medium ), - AtomicField(name = "mkt_source", _.mkt_source ), - AtomicField(name = "mkt_term", _.mkt_term ), - AtomicField(name = "mkt_content", _.mkt_content ), - AtomicField(name = "mkt_campaign", _.mkt_campaign ), - AtomicField(name = "se_category", _.se_category ), - AtomicField(name = "se_action", _.se_action ), - AtomicField(name = "se_label", _.se_label ), - AtomicField(name = "se_property", _.se_property ), - AtomicField(name = "tr_orderid", _.tr_orderid ), - AtomicField(name = "tr_affiliation", _.tr_affiliation ), - AtomicField(name = "tr_city", _.tr_city ), - AtomicField(name = "tr_state", _.tr_state ), - AtomicField(name = "tr_country", _.tr_country ), - AtomicField(name = "ti_orderid", _.ti_orderid ), - AtomicField(name = "ti_sku", _.ti_sku ), - AtomicField(name = "ti_name", _.ti_name ), - AtomicField(name = "ti_category", _.ti_category ), - AtomicField(name = "useragent", _.useragent ), - AtomicField(name = "br_name", _.br_name ), - AtomicField(name = "br_family", _.br_family ), - AtomicField(name = "br_version", _.br_version ), - AtomicField(name = "br_type", _.br_type ), - AtomicField(name = "br_renderengine", _.br_renderengine ), - AtomicField(name = "br_lang", _.br_lang ), - AtomicField(name = "br_colordepth", _.br_colordepth ), - AtomicField(name = "os_name", _.os_name ), - AtomicField(name = "os_family", _.os_family ), - AtomicField(name = "os_manufacturer", _.os_manufacturer ), - AtomicField(name = "os_timezone", _.os_timezone ), - AtomicField(name = "dvce_type", _.dvce_type ), - AtomicField(name = "doc_charset", _.doc_charset ), - AtomicField(name = "tr_currency", _.tr_currency ), - AtomicField(name = "ti_currency", _.ti_currency ), - AtomicField(name = "base_currency", _.base_currency ), - AtomicField(name = "geo_timezone", _.geo_timezone ), - AtomicField(name = "mkt_clickid", _.mkt_clickid ), - AtomicField(name = "mkt_network", _.mkt_network ), - AtomicField(name = "etl_tags", _.etl_tags ), - AtomicField(name = "refr_domain_userid", _.refr_domain_userid ), - AtomicField(name = "domain_sessionid", _.domain_sessionid ), - AtomicField(name = "event_vendor", _.event_vendor ), - AtomicField(name = "event_name", _.event_name ), - AtomicField(name = "event_format", _.event_format ), - AtomicField(name = "event_version", _.event_version ), - AtomicField(name = "event_fingerprint", _.event_fingerprint ), + AtomicField(name = "app_id", _.app_id, _.app_id = null), + AtomicField(name = "platform", _.platform, _.platform = null), + AtomicField(name = "event", _.event, _.event = null), + AtomicField(name = "event_id", _.event_id, _ => ()), // required in loading + AtomicField(name = "name_tracker", _.name_tracker, _.name_tracker = null), + AtomicField(name = "v_tracker", _.v_tracker, _.v_tracker = null), + AtomicField(name = "v_collector", _.v_collector, _ => ()), // required in loading + AtomicField(name = "v_etl", _.v_etl, _ => ()), // required in loading + AtomicField(name = "user_id", _.user_id, _.user_id = null), + AtomicField(name = "user_ipaddress", _.user_ipaddress, _.user_ipaddress = null), + AtomicField(name = "user_fingerprint", _.user_fingerprint, _.user_fingerprint = null), + AtomicField(name = "domain_userid", _.domain_userid, _.domain_userid = null), + AtomicField(name = "network_userid", _.network_userid, _.network_userid = null), + AtomicField(name = "geo_country", _.geo_country, _.geo_country = null), + AtomicField(name = "geo_region", _.geo_region, _.geo_region = null), + AtomicField(name = "geo_city", _.geo_city, _.geo_city = null), + AtomicField(name = "geo_zipcode", _.geo_zipcode, _.geo_zipcode = null), + AtomicField(name = "geo_region_name", _.geo_region_name, _.geo_region_name = null), + AtomicField(name = "ip_isp", _.ip_isp, _.ip_isp = null), + AtomicField(name = "ip_organization", _.ip_organization, _.ip_organization = null), + AtomicField(name = "ip_domain", _.ip_domain, _.ip_domain = null), + AtomicField(name = "ip_netspeed", _.ip_netspeed, _.ip_netspeed = null), + AtomicField(name = "page_url", _.page_url, _.page_url = null), + AtomicField(name = "page_title", _.page_title, _.page_title = null), + AtomicField(name = "page_referrer", _.page_referrer, _.page_referrer = null), + AtomicField(name = "page_urlscheme", _.page_urlscheme, _.page_urlscheme = null), + AtomicField(name = "page_urlhost", _.page_urlhost, _.page_urlhost = null), + AtomicField(name = "page_urlpath", _.page_urlpath, _.page_urlpath = null), + AtomicField(name = "page_urlquery", _.page_urlquery, _.page_urlquery = null), + AtomicField(name = "page_urlfragment", _.page_urlfragment, _.page_urlfragment = null), + AtomicField(name = "refr_urlscheme", _.refr_urlscheme, _.refr_urlscheme = null), + AtomicField(name = "refr_urlhost", _.refr_urlhost, _.refr_urlhost = null), + AtomicField(name = "refr_urlpath", _.refr_urlpath, _.refr_urlpath = null), + AtomicField(name = "refr_urlquery", _.refr_urlquery, _.refr_urlquery = null), + AtomicField(name = "refr_urlfragment", _.refr_urlfragment, _.refr_urlfragment = null), + AtomicField(name = "refr_medium", _.refr_medium, _.refr_medium = null), + AtomicField(name = "refr_source", _.refr_source, _.refr_source = null), + AtomicField(name = "refr_term", _.refr_term, _.refr_term = null), + AtomicField(name = "mkt_medium", _.mkt_medium, _.mkt_medium = null), + AtomicField(name = "mkt_source", _.mkt_source, _.mkt_source = null), + AtomicField(name = "mkt_term", _.mkt_term, _.mkt_term = null), + AtomicField(name = "mkt_content", _.mkt_content, _.mkt_content = null), + AtomicField(name = "mkt_campaign", _.mkt_campaign, _.mkt_campaign = null), + AtomicField(name = "se_category", _.se_category, _.se_category = null), + AtomicField(name = "se_action", _.se_action, _.se_action = null), + AtomicField(name = "se_label", _.se_label, _.se_label = null), + AtomicField(name = "se_property", _.se_property, _.se_property = null), + AtomicField(name = "tr_orderid", _.tr_orderid, _.tr_orderid = null), + AtomicField(name = "tr_affiliation", _.tr_affiliation, _.tr_affiliation = null), + AtomicField(name = "tr_city", _.tr_city, _.tr_city = null), + AtomicField(name = "tr_state", _.tr_state, _.tr_state = null), + AtomicField(name = "tr_country", _.tr_country, _.tr_country = null), + AtomicField(name = "ti_orderid", _.ti_orderid, _.ti_orderid = null), + AtomicField(name = "ti_sku", _.ti_sku, _.ti_sku = null), + AtomicField(name = "ti_name", _.ti_name, _.ti_name = null), + AtomicField(name = "ti_category", _.ti_category, _.ti_category = null), + AtomicField(name = "useragent", _.useragent, _.useragent = null), + AtomicField(name = "br_name", _.br_name, _.br_name = null), + AtomicField(name = "br_family", _.br_family, _.br_family = null), + AtomicField(name = "br_version", _.br_version, _.br_version = null), + AtomicField(name = "br_type", _.br_type, _.br_type = null), + AtomicField(name = "br_renderengine", _.br_renderengine, _.br_renderengine = null), + AtomicField(name = "br_lang", _.br_lang, _.br_lang = null), + AtomicField(name = "br_colordepth", _.br_colordepth, _.br_colordepth = null), + AtomicField(name = "os_name", _.os_name, _.os_name = null), + AtomicField(name = "os_family", _.os_family, _.os_family = null), + AtomicField(name = "os_manufacturer", _.os_manufacturer, _.os_manufacturer = null), + AtomicField(name = "os_timezone", _.os_timezone, _.os_timezone = null), + AtomicField(name = "dvce_type", _.dvce_type, _.dvce_type = null), + AtomicField(name = "doc_charset", _.doc_charset, _.doc_charset = null), + AtomicField(name = "tr_currency", _.tr_currency, _.tr_currency = null), + AtomicField(name = "ti_currency", _.ti_currency, _.ti_currency = null), + AtomicField(name = "base_currency", _.base_currency, _.base_currency = null), + AtomicField(name = "geo_timezone", _.geo_timezone, _.geo_timezone = null), + AtomicField(name = "mkt_clickid", _.mkt_clickid, _.mkt_clickid = null), + AtomicField(name = "mkt_network", _.mkt_network, _.mkt_network = null), + AtomicField(name = "etl_tags", _.etl_tags, _.etl_tags = null), + AtomicField(name = "refr_domain_userid", _.refr_domain_userid, _.refr_domain_userid = null), + AtomicField(name = "domain_sessionid", _.domain_sessionid, _.domain_sessionid = null), + AtomicField(name = "event_vendor", _.event_vendor, _.event_vendor = null), + AtomicField(name = "event_name", _.event_name, _.event_name = null), + AtomicField(name = "event_format", _.event_format, _.event_format = null), + AtomicField(name = "event_version", _.event_version, _.event_version = null), + AtomicField(name = "event_fingerprint", _.event_fingerprint, _.event_fingerprint = null), ) // format: on diff --git a/modules/common/src/main/scala/com.snowplowanalytics.snowplow.enrich/common/enrichments/AtomicFieldsLengthValidator.scala b/modules/common/src/main/scala/com.snowplowanalytics.snowplow.enrich/common/enrichments/AtomicFieldsLengthValidator.scala index 01929f75a..e3d80f94f 100644 --- a/modules/common/src/main/scala/com.snowplowanalytics.snowplow.enrich/common/enrichments/AtomicFieldsLengthValidator.scala +++ b/modules/common/src/main/scala/com.snowplowanalytics.snowplow.enrich/common/enrichments/AtomicFieldsLengthValidator.scala @@ -34,11 +34,12 @@ object AtomicFieldsLengthValidator { event: EnrichedEvent, acceptInvalid: Boolean, invalidCount: F[Unit], - atomicFields: AtomicFields + atomicFields: AtomicFields, + emitIncomplete: Boolean ): IorT[F, Failure.SchemaViolation, Unit] = IorT { atomicFields.value - .map(validateField(event, _).toValidatedNel) + .map(validateField(event, _, emitIncomplete).toValidatedNel) .combineAll match { case Invalid(errors) if acceptInvalid => handleAcceptableErrors(invalidCount, event, errors) *> Monad[F].pure(Ior.Right(())) @@ -51,17 +52,19 @@ object AtomicFieldsLengthValidator { private def validateField( event: EnrichedEvent, - atomicField: LimitedAtomicField + atomicField: LimitedAtomicField, + emitIncomplete: Boolean ): Either[ValidatorReport, Unit] = { val actualValue = atomicField.value.enrichedValueExtractor(event) - if (actualValue != null && actualValue.length > atomicField.limit) + if (actualValue != null && actualValue.length > atomicField.limit) { + if (emitIncomplete) atomicField.value.nullify(event) ValidatorReport( s"Field is longer than maximum allowed size ${atomicField.limit}", Some(atomicField.value.name), Nil, Some(actualValue) ).asLeft - else + } else Right(()) } diff --git a/modules/common/src/main/scala/com.snowplowanalytics.snowplow.enrich/common/enrichments/EnrichmentManager.scala b/modules/common/src/main/scala/com.snowplowanalytics.snowplow.enrich/common/enrichments/EnrichmentManager.scala index 6737dccfe..eaa0b5e66 100644 --- a/modules/common/src/main/scala/com.snowplowanalytics.snowplow.enrich/common/enrichments/EnrichmentManager.scala +++ b/modules/common/src/main/scala/com.snowplowanalytics.snowplow.enrich/common/enrichments/EnrichmentManager.scala @@ -106,7 +106,8 @@ object EnrichmentManager { registryLookup, featureFlags.acceptInvalid, invalidCount, - atomicFields + atomicFields, + emitIncomplete ) .leftMap(NonEmptyList.one) .possiblyExitingEarly(emitIncomplete) @@ -230,12 +231,13 @@ object EnrichmentManager { registryLookup: RegistryLookup[F], acceptInvalid: Boolean, invalidCount: F[Unit], - atomicFields: AtomicFields + atomicFields: AtomicFields, + emitIncomplete: Boolean ): IorT[F, NonEmptyList[Failure], List[SelfDescribingData[Json]]] = for { validContexts <- IgluUtils.validateEnrichmentsContexts[F](client, enrichmentsContexts, registryLookup) _ <- AtomicFieldsLengthValidator - .validate[F](enriched, acceptInvalid, invalidCount, atomicFields) + .validate[F](enriched, acceptInvalid, invalidCount, atomicFields, emitIncomplete) .leftMap { v: Failure => NonEmptyList.one(v) } } yield validContexts diff --git a/modules/common/src/test/scala/com.snowplowanalytics.snowplow.enrich.common/enrichments/EnrichmentManagerSpec.scala b/modules/common/src/test/scala/com.snowplowanalytics.snowplow.enrich.common/enrichments/EnrichmentManagerSpec.scala index fe8317c5d..69db67fb9 100644 --- a/modules/common/src/test/scala/com.snowplowanalytics.snowplow.enrich.common/enrichments/EnrichmentManagerSpec.scala +++ b/modules/common/src/test/scala/com.snowplowanalytics.snowplow.enrich.common/enrichments/EnrichmentManagerSpec.scala @@ -2417,6 +2417,27 @@ class EnrichmentManagerSpec extends Specification with EitherMatchers with CatsE ko(s"[$other] is not a SchemaViolations bad row with 2 IgluError") } } + + "remove an oversized atomic field if emitIncomplete is set to true" >> { + val enriched = EnrichmentManager + .enrichEvent[IO]( + enrichmentReg, + client, + processor, + timestamp, + RawEvent(api, fatBody, None, source, context), + featureFlags = AcceptInvalid.featureFlags.copy(acceptInvalid = false), + IO.unit, + SpecHelpers.registryLookup, + atomicFieldLimits, + emitIncomplete = true + ) + + enriched.value.map { + case Ior.Both(_: BadRow.SchemaViolations, enriched) if Option(enriched.v_tracker).isEmpty => ok + case other => ko(s"[$other] is not a SchemaViolations bad row and an enriched event without tracker version") + } + } } "setDerivedContexts" should {