From 5f717a9f16535db63839ef1a801b5fcf6b25ebfc Mon Sep 17 00:00:00 2001 From: Niranjan Artal Date: Tue, 31 Dec 2024 14:36:23 -0800 Subject: [PATCH 1/5] fix for nested type Signed-off-by: Niranjan Artal --- src/main/cpp/src/NativeParquetJni.cpp | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/main/cpp/src/NativeParquetJni.cpp b/src/main/cpp/src/NativeParquetJni.cpp index 2a1c00b86e..906fa4304e 100644 --- a/src/main/cpp/src/NativeParquetJni.cpp +++ b/src/main/cpp/src/NativeParquetJni.cpp @@ -721,6 +721,23 @@ Java_com_nvidia_spark_rapids_jni_ParquetFooter_readAndFilter(JNIEnv* env, parent_num_children); auto filter = pruner.filter_schema(meta->schema, ignore_case); + if (filter.schema_map.empty()) { + // Possibly still filter row groups by offset, so rowCount is correct + std::vector filtered_row_groups; + if (part_length >= 0) { + filtered_row_groups = rapids::jni::filter_groups( + *meta, part_offset, part_length); + } + meta->row_groups = std::move(filtered_row_groups); + // Clear schema => physically 0 columns remain + meta->schema.clear(); + //meta->__isset.schema = true; + // (We keep meta->num_rows unset or derived from row groups at read time.) + + // Return the pruned metadata + return cudf::jni::release_as_jlong(meta); + } + // start by filtering the schema and the chunks std::size_t new_schema_size = filter.schema_map.size(); std::vector new_schema(new_schema_size); From 21fb7453149d918121978fa084e189a4090d9434 Mon Sep 17 00:00:00 2001 From: Niranjan Artal Date: Mon, 6 Jan 2025 17:41:28 -0800 Subject: [PATCH 2/5] Improve Struct Pruning Logic for Parquet Schema Filtering Signed-off-by: Niranjan Artal --- src/main/cpp/src/NativeParquetJni.cpp | 65 +++++++++++++++------------ 1 file changed, 37 insertions(+), 28 deletions(-) diff --git a/src/main/cpp/src/NativeParquetJni.cpp b/src/main/cpp/src/NativeParquetJni.cpp index 906fa4304e..b6c3fe12c8 100644 --- a/src/main/cpp/src/NativeParquetJni.cpp +++ b/src/main/cpp/src/NativeParquetJni.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -204,13 +204,23 @@ class column_pruner { if (is_leaf) { throw std::runtime_error("Found a leaf node, but expected to find a struct"); } int num_children = get_num_children(struct_schema_item); - // Now that everything looks good add ourselves into the maps, and move to the next entry to - // look at. - schema_map.push_back(current_input_schema_index); - // We will update the num_children each time we find one... - int our_num_children_index = schema_num_children.size(); - schema_num_children.push_back(0); + + // Save the current index (the position of this Struct in the full schema). + // We'll need this to add it to the pruned schema map if it qualifies. + int struct_index = current_input_schema_index; + + // We add the Struct to schema_map (with a placeholder for child count). + // But we might remove it later if no children match. This ensures parent-before-children + // ordering in schema_map. + schema_map.push_back(struct_index); + schema_num_children.push_back(0); // Placeholder for child count + + // This is necessary to update the child count accurately. + // Since we just added a new element to schema_num_children, its position is the last index. + int struct_map_position = schema_num_children.size() - 1; + // Move to first child ++current_input_schema_index; + int matched_children_count = 0; // For a STRUCT we want to look for all of the children that match the name and let each of them // handle updating things themselves. @@ -221,9 +231,10 @@ class column_pruner { auto found = children.find(name); if (found != children.end()) { - // found a match so update the number of children that passed the filter and ask it to - // filter itself. - ++schema_num_children[our_num_children_index]; + + // Record the current Size of schema_map before processing the child + // This helps determine if the child was retained after processing. + std::size_t before_child_size = schema_map.size(); found->second.filter_schema(schema, ignore_case, current_input_schema_index, @@ -231,11 +242,26 @@ class column_pruner { chunk_map, schema_map, schema_num_children); + + // If the size of schema_map has increased, it means that child was successfully + // included + if (schema_map.size() > before_child_size) { + ++matched_children_count; + // Update the child count for the struct + schema_num_children[struct_map_position]++; + } } else { - // No match was found so skip the child. + // Skip non-matching children skip(schema, current_input_schema_index, next_input_chunk_index); } } + + // If the below condition is true, it means none of this Struct's children + // were included. We should remove it from `schema_map` and `schema_num_children`. + if (matched_children_count == 0) { + schema_map.pop_back(); + schema_num_children.pop_back(); + } } /** @@ -721,23 +747,6 @@ Java_com_nvidia_spark_rapids_jni_ParquetFooter_readAndFilter(JNIEnv* env, parent_num_children); auto filter = pruner.filter_schema(meta->schema, ignore_case); - if (filter.schema_map.empty()) { - // Possibly still filter row groups by offset, so rowCount is correct - std::vector filtered_row_groups; - if (part_length >= 0) { - filtered_row_groups = rapids::jni::filter_groups( - *meta, part_offset, part_length); - } - meta->row_groups = std::move(filtered_row_groups); - // Clear schema => physically 0 columns remain - meta->schema.clear(); - //meta->__isset.schema = true; - // (We keep meta->num_rows unset or derived from row groups at read time.) - - // Return the pruned metadata - return cudf::jni::release_as_jlong(meta); - } - // start by filtering the schema and the chunks std::size_t new_schema_size = filter.schema_map.size(); std::vector new_schema(new_schema_size); From fa9907d14a4074030d2fdab9b0881fa4c939099b Mon Sep 17 00:00:00 2001 From: Niranjan Artal Date: Mon, 6 Jan 2025 17:52:13 -0800 Subject: [PATCH 3/5] Update comments --- src/main/cpp/src/NativeParquetJni.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/main/cpp/src/NativeParquetJni.cpp b/src/main/cpp/src/NativeParquetJni.cpp index b6c3fe12c8..393ca72bfb 100644 --- a/src/main/cpp/src/NativeParquetJni.cpp +++ b/src/main/cpp/src/NativeParquetJni.cpp @@ -205,7 +205,7 @@ class column_pruner { int num_children = get_num_children(struct_schema_item); - // Save the current index (the position of this Struct in the full schema). + // Save the current index (the position of this struct in the full schema). // We'll need this to add it to the pruned schema map if it qualifies. int struct_index = current_input_schema_index; @@ -232,7 +232,7 @@ class column_pruner { if (found != children.end()) { - // Record the current Size of schema_map before processing the child + // Record the current size of schema_map before processing the child // This helps determine if the child was retained after processing. std::size_t before_child_size = schema_map.size(); found->second.filter_schema(schema, @@ -251,12 +251,12 @@ class column_pruner { schema_num_children[struct_map_position]++; } } else { - // Skip non-matching children + // No match was found so skip the child skip(schema, current_input_schema_index, next_input_chunk_index); } } - // If the below condition is true, it means none of this Struct's children + // If the below condition is true, it means none of this struct's children // were included. We should remove it from `schema_map` and `schema_num_children`. if (matched_children_count == 0) { schema_map.pop_back(); From 5d6953672a552099cf360c0381494e5b28b2ea8a Mon Sep 17 00:00:00 2001 From: Niranjan Artal Date: Mon, 6 Jan 2025 18:09:39 -0800 Subject: [PATCH 4/5] Update clang format --- src/main/cpp/src/NativeParquetJni.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/main/cpp/src/NativeParquetJni.cpp b/src/main/cpp/src/NativeParquetJni.cpp index 393ca72bfb..1c7d877a9f 100644 --- a/src/main/cpp/src/NativeParquetJni.cpp +++ b/src/main/cpp/src/NativeParquetJni.cpp @@ -213,7 +213,7 @@ class column_pruner { // But we might remove it later if no children match. This ensures parent-before-children // ordering in schema_map. schema_map.push_back(struct_index); - schema_num_children.push_back(0); // Placeholder for child count + schema_num_children.push_back(0); // Placeholder for child count // This is necessary to update the child count accurately. // Since we just added a new element to schema_num_children, its position is the last index. @@ -231,7 +231,6 @@ class column_pruner { auto found = children.find(name); if (found != children.end()) { - // Record the current size of schema_map before processing the child // This helps determine if the child was retained after processing. std::size_t before_child_size = schema_map.size(); @@ -259,8 +258,8 @@ class column_pruner { // If the below condition is true, it means none of this struct's children // were included. We should remove it from `schema_map` and `schema_num_children`. if (matched_children_count == 0) { - schema_map.pop_back(); - schema_num_children.pop_back(); + schema_map.pop_back(); + schema_num_children.pop_back(); } } From a564c9e59439d26255ee0a562599cf4af6f956b7 Mon Sep 17 00:00:00 2001 From: Niranjan Artal Date: Tue, 7 Jan 2025 13:33:21 -0800 Subject: [PATCH 5/5] addressed review comments Signed-off-by: Niranjan Artal --- src/main/cpp/src/NativeParquetJni.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/cpp/src/NativeParquetJni.cpp b/src/main/cpp/src/NativeParquetJni.cpp index 1c7d877a9f..7c32220913 100644 --- a/src/main/cpp/src/NativeParquetJni.cpp +++ b/src/main/cpp/src/NativeParquetJni.cpp @@ -233,7 +233,7 @@ class column_pruner { if (found != children.end()) { // Record the current size of schema_map before processing the child // This helps determine if the child was retained after processing. - std::size_t before_child_size = schema_map.size(); + auto const before_child_size = schema_map.size(); found->second.filter_schema(schema, ignore_case, current_input_schema_index, @@ -250,7 +250,7 @@ class column_pruner { schema_num_children[struct_map_position]++; } } else { - // No match was found so skip the child + // No match was found so skip the child. skip(schema, current_input_schema_index, next_input_chunk_index); } }