-
Notifications
You must be signed in to change notification settings - Fork 1.5k
GH-3601: Cache shouldIgnoreStatistics version parsing result #3607
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -970,6 +970,46 @@ public org.apache.parquet.column.statistics.Statistics fromParquetStatistics( | |
| return fromParquetStatisticsInternal(createdBy, statistics, type, expectedOrder); | ||
| } | ||
|
|
||
| // Overload that uses a pre-computed shouldIgnoreCorruptStats flag to avoid redundant parsing | ||
| private org.apache.parquet.column.statistics.Statistics fromParquetStatisticsInternal( | ||
| String createdBy, Statistics formatStats, PrimitiveType type, boolean shouldIgnoreCorruptStats) { | ||
| SortOrder typeSortOrder = overrideSortOrderToSigned(type) ? SortOrder.SIGNED : sortOrder(type); | ||
| org.apache.parquet.column.statistics.Statistics.Builder statsBuilder = | ||
| org.apache.parquet.column.statistics.Statistics.getBuilderForReading(type); | ||
|
|
||
| if (formatStats != null) { | ||
| if (formatStats.isSetMin_value() && formatStats.isSetMax_value()) { | ||
| byte[] min = formatStats.min_value.array(); | ||
| byte[] max = formatStats.max_value.array(); | ||
| if (isMinMaxStatsSupported(type) || Arrays.equals(min, max)) { | ||
| statsBuilder.withMin(min); | ||
| statsBuilder.withMax(max); | ||
| } | ||
| } else { | ||
| boolean isSet = formatStats.isSetMax() && formatStats.isSetMin(); | ||
| boolean maxEqualsMin = isSet ? Arrays.equals(formatStats.getMin(), formatStats.getMax()) : false; | ||
| boolean sortOrdersMatch = SortOrder.SIGNED == typeSortOrder; | ||
| // The shouldIgnoreCorruptStats flag applies only to BINARY and FIXED_LEN_BYTE_ARRAY. | ||
| // For other types, shouldIgnoreStatistics always returns false, so we only guard those. | ||
| PrimitiveTypeName primitiveTypeName = type.getPrimitiveTypeName(); | ||
| boolean ignoreForThisColumn = shouldIgnoreCorruptStats | ||
| && (primitiveTypeName == PrimitiveTypeName.BINARY | ||
| || primitiveTypeName == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY); | ||
| if (!ignoreForThisColumn && (sortOrdersMatch || maxEqualsMin)) { | ||
|
Comment on lines
+995
to
+998
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This check in […]. We can utilize the new methods here. Please refer to the comments below for context. Instead of calling or moving the […] |
||
| if (isSet) { | ||
| statsBuilder.withMin(formatStats.min.array()); | ||
| statsBuilder.withMax(formatStats.max.array()); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| if (formatStats.isSetNull_count()) { | ||
| statsBuilder.withNumNulls(formatStats.null_count); | ||
| } | ||
| } | ||
| return statsBuilder.build(); | ||
| } | ||
|
|
||
| GeospatialStatistics toParquetGeospatialStatistics( | ||
| org.apache.parquet.column.statistics.geospatial.GeospatialStatistics geospatialStatistics) { | ||
| if (geospatialStatistics == null) { | ||
|
|
@@ -1794,13 +1834,24 @@ public FileMetaDataAndRowGroupOffsetInfo visit(RangeMetadataFilter filter) throw | |
|
|
||
| public ColumnChunkMetaData buildColumnChunkMetaData( | ||
| ColumnMetaData metaData, ColumnPath columnPath, PrimitiveType type, String createdBy) { | ||
| boolean shouldIgnoreCorruptStats = | ||
| CorruptStatistics.shouldIgnoreStatistics(createdBy, PrimitiveTypeName.BINARY); | ||
| return buildColumnChunkMetaData(metaData, columnPath, type, createdBy, shouldIgnoreCorruptStats); | ||
|
Comment on lines
+1837
to
+1839
|
||
| } | ||
|
|
||
| ColumnChunkMetaData buildColumnChunkMetaData( | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
No need to pass createdBy downstream; the boolean is all the internal overload needs. SortOrder computation moves here since we bypass […]. Also notice how the new public methods we extracted in […] |
||
| ColumnMetaData metaData, | ||
| ColumnPath columnPath, | ||
| PrimitiveType type, | ||
| String createdBy, | ||
| boolean shouldIgnoreCorruptStats) { | ||
| return ColumnChunkMetaData.get( | ||
| columnPath, | ||
| type, | ||
| fromFormatCodec(metaData.codec), | ||
| convertEncodingStats(metaData.getEncoding_stats()), | ||
| fromFormatEncodings(metaData.encodings), | ||
| fromParquetStatistics(createdBy, metaData.statistics, type), | ||
| fromParquetStatisticsInternal(createdBy, metaData.statistics, type, shouldIgnoreCorruptStats), | ||
| metaData.data_page_offset, | ||
| metaData.dictionary_page_offset, | ||
| metaData.num_values, | ||
|
|
@@ -1829,6 +1880,10 @@ public ParquetMetadata fromParquetMetadata( | |
| MessageType messageType = fromParquetSchema(parquetMetadata.getSchema(), parquetMetadata.getColumn_orders()); | ||
| List<BlockMetaData> blocks = new ArrayList<BlockMetaData>(); | ||
| List<RowGroup> row_groups = parquetMetadata.getRow_groups(); | ||
| // Compute once per file: the result is the same for BINARY and FIXED_LEN_BYTE_ARRAY | ||
| // (the only types affected by PARQUET-251), and always false for other types. | ||
|
Comment on lines
+1883
to
+1884
|
||
| boolean shouldIgnoreCorruptStats = | ||
| CorruptStatistics.shouldIgnoreStatistics(parquetMetadata.getCreated_by(), PrimitiveTypeName.BINARY); | ||
|
Comment on lines
+1883
to
+1886
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is calling […]. Instead we can refactor […] |
||
|
|
||
| if (row_groups != null) { | ||
| for (RowGroup rowGroup : row_groups) { | ||
|
|
@@ -1909,7 +1964,8 @@ public ParquetMetadata fromParquetMetadata( | |
| metaData, | ||
| columnPath, | ||
| messageType.getType(columnPath.toArray()).asPrimitiveType(), | ||
| createdBy); | ||
| createdBy, | ||
| shouldIgnoreCorruptStats); | ||
| column.setRowGroupOrdinal(rowGroup.getOrdinal()); | ||
| if (metaData.isSetBloom_filter_offset()) { | ||
| column.setBloomFilterOffset(metaData.getBloom_filter_offset()); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Instead of duplicating the entire
`fromParquetStatisticsInternal` body, the existing method can simply delegate to a new overload, which eliminates the duplicate code.