// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

//! Parquet metadata API
//!
//! Users should use these structures to interact with Parquet metadata.
//!
//! * [`ParquetMetaData`]: Top level metadata container, read from the Parquet
//!   file footer.
//!
//! * [`FileMetaData`]: File level metadata such as schema, row counts and
//!   version.
//!
//! * [`RowGroupMetaData`]: Metadata for each Row Group within a File, such as
//!   location and number of rows, and column chunks.
//!
//! * [`ColumnChunkMetaData`]: Metadata for each column chunk (primitive leaf)
//!   within a Row Group including encoding and compression information,
//!   number of values, statistics, etc.
//!
//! # APIs for working with Parquet Metadata
//!
//! The Parquet readers and writers in this crate handle reading and writing
//! metadata into parquet files. To work with metadata directly,
//! the following APIs are available:
//!
//! * [`ParquetMetaDataReader`] for reading metadata from an I/O source (sync and async)
//! * [`ParquetMetaDataPushDecoder`] for decoding from bytes without I/O
//! * [`ParquetMetaDataWriter`] for writing.
//!
//! # Examples
//!
//! Please see [`external_metadata.rs`]
//!
//! [`external_metadata.rs`]: https://github.com/apache/arrow-rs/tree/master/parquet/examples/external_metadata.rs
//!
//! # Metadata Encodings and Structures
//!
//! There are three different encodings of Parquet Metadata in this crate:
//!
//! 1. `bytes`: encoded with the Thrift `TCompactProtocol` as defined in
//!    [parquet.thrift]
//!
//! 2. [`format`]: Rust structures automatically generated by the thrift compiler
//!    from [parquet.thrift]. These structures are low level and mirror
//!    the thrift definitions.
//!
//! 3. [`file::metadata`] (this module): Easier to use Rust structures
//!    with a more idiomatic API. Note that, confusingly, some but not all
//!    of these structures have the same name as the [`format`] structures.
//!
//! [`file::metadata`]: crate::file::metadata
//! [parquet.thrift]:  https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
//!
//! Graphically, this is how the different structures relate to each other:
//!
//! ```text
//!                          ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─         ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─
//!                            ┌──────────────┐     │         ┌───────────────────────┐ │
//!                          │ │ ColumnIndex  │              ││    ParquetMetaData    │
//!                            └──────────────┘     │         └───────────────────────┘ │
//! ┌──────────────┐         │ ┌────────────────┐            │┌───────────────────────┐
//! │   ..0x24..   │ ◀────▶    │  OffsetIndex   │   │ ◀────▶  │    ParquetMetaData    │ │
//! └──────────────┘         │ └────────────────┘            │└───────────────────────┘
//!                                     ...         │                   ...             │
//!                          │ ┌──────────────────┐          │ ┌──────────────────┐
//! bytes                      │  FileMetaData*   │ │          │  FileMetaData*   │     │
//! (thrift encoded)         │ └──────────────────┘          │ └──────────────────┘
//!                           ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘         ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘
//!
//!                          format::meta structures          file::metadata structures
//!
//!                         * Same name, different struct
//! ```
mod footer_tail;
mod memory;
mod options;
mod parser;
mod push_decoder;
pub(crate) mod reader;
pub(crate) mod thrift;
mod writer;

use crate::basic::{EncodingMask, PageType};
#[cfg(feature = "encryption")]
use crate::encryption::decrypt::FileDecryptor;
#[cfg(feature = "encryption")]
use crate::file::column_crypto_metadata::ColumnCryptoMetaData;
pub(crate) use crate::file::metadata::memory::HeapSize;
#[cfg(feature = "encryption")]
use crate::file::metadata::thrift::encryption::EncryptionAlgorithm;
use crate::file::page_index::column_index::{ByteArrayColumnIndex, PrimitiveColumnIndex};
use crate::file::page_index::{column_index::ColumnIndexMetaData, offset_index::PageLocation};
use crate::file::statistics::Statistics;
use crate::geospatial::statistics as geo_statistics;
use crate::schema::types::{
    ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, SchemaDescriptor,
    Type as SchemaType,
};
use crate::thrift_struct;
use crate::{
    basic::BoundaryOrder,
    errors::{ParquetError, Result},
};
use crate::{
    basic::{ColumnOrder, Compression, Encoding, Type},
    parquet_thrift::{
        ElementType, FieldType, ReadThrift, ThriftCompactInputProtocol,
        ThriftCompactOutputProtocol, WriteThrift, WriteThriftField,
    },
};
use crate::{
    data_type::private::ParquetValueType, file::page_index::offset_index::OffsetIndexMetaData,
};

pub use footer_tail::FooterTail;
pub use options::{ParquetMetaDataOptions, ParquetStatisticsPolicy};
pub use push_decoder::ParquetMetaDataPushDecoder;
pub use reader::{PageIndexPolicy, ParquetMetaDataReader};
use std::io::Write;
use std::ops::Range;
use std::sync::Arc;
pub use writer::ParquetMetaDataWriter;
pub(crate) use writer::ThriftMetadataWriter;

/// Page level statistics for each column chunk of each row group.
///
/// This structure is an in-memory representation of multiple [`ColumnIndex`]
/// structures in a parquet file footer, as described in the Parquet [PageIndex
/// documentation]. Each [`ColumnIndex`] holds statistics about all the pages in a
/// particular column chunk.
///
/// `column_index[row_group_number][column_number]` holds the
/// [`ColumnIndex`] corresponding to column `column_number` of row group
/// `row_group_number`. Both indices are zero-based: the outer `Vec` is
/// indexed by row group and each inner `Vec` is indexed by column.
///
/// For example `column_index[2][3]` holds the [`ColumnIndex`] for the fourth
/// column in the third row group of the parquet file.
///
/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
/// [`ColumnIndex`]: crate::file::page_index::column_index::ColumnIndexMetaData
pub type ParquetColumnIndex = Vec<Vec<ColumnIndexMetaData>>;

/// [`OffsetIndexMetaData`] for each data page of each row group of each column
///
/// This structure is the parsed representation of the [`OffsetIndex`] from the
/// Parquet file footer, as described in the Parquet [PageIndex documentation].
///
/// `offset_index[row_group_number][column_number]` holds
/// the [`OffsetIndexMetaData`] corresponding to column
/// `column_number` of row group `row_group_number`.
///
/// For example `offset_index[2][3]` holds the [`OffsetIndexMetaData`] for the
/// fourth column in the third row group of the parquet file.
///
/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
/// [`OffsetIndex`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub type ParquetOffsetIndex = Vec<Vec<OffsetIndexMetaData>>;

/// Parsed metadata for a single Parquet file
///
/// This structure is stored in the footer of Parquet files, in the format
/// defined by [`parquet.thrift`].
///
/// # Overview
/// The fields of this structure are:
/// * [`FileMetaData`]: Information about the overall file (such as the schema) (See [`Self::file_metadata`])
/// * [`RowGroupMetaData`]: Information about each Row Group (see [`Self::row_groups`])
/// * [`ParquetColumnIndex`] and [`ParquetOffsetIndex`]: Optional "Page Index" structures (see [`Self::column_index`] and [`Self::offset_index`])
///
/// This structure is read by the various readers in this crate or can be read
/// directly from a file using the [`ParquetMetaDataReader`] struct.
///
/// See the [`ParquetMetaDataBuilder`] to create and modify this structure.
///
/// [`parquet.thrift`]: https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
#[derive(Debug, Clone, PartialEq)]
pub struct ParquetMetaData {
    /// File level metadata (see [`FileMetaData`])
    file_metadata: FileMetaData,
    /// Metadata for each row group in the file
    row_groups: Vec<RowGroupMetaData>,
    /// Column index (page level statistics) for each column chunk, indexed
    /// as `[row_group][column]`. `None` until one is set.
    column_index: Option<ParquetColumnIndex>,
    /// Offset index for each column chunk, indexed as `[row_group][column]`.
    /// `None` until one is set.
    offset_index: Option<ParquetOffsetIndex>,
    /// Optional file decryptor, stored boxed. Only compiled in when the
    /// `encryption` feature is enabled.
    #[cfg(feature = "encryption")]
    file_decryptor: Option<Box<FileDecryptor>>,
}

impl ParquetMetaData {
    /// Creates Parquet metadata from file metadata and a list of row
    /// group metadata
    pub fn new(file_metadata: FileMetaData, row_groups: Vec<RowGroupMetaData>) -> Self {
        ParquetMetaData {
            file_metadata,
            row_groups,
            column_index: None,
            offset_index: None,
            #[cfg(feature = "encryption")]
            file_decryptor: None,
        }
    }

    /// Adds [`FileDecryptor`] to this metadata instance to enable decryption of
    /// encrypted data.
    #[cfg(feature = "encryption")]
    pub(crate) fn with_file_decryptor(&mut self, file_decryptor: Option<FileDecryptor>) {
        self.file_decryptor = file_decryptor.map(Box::new);
    }

    /// Convert this ParquetMetaData into a [`ParquetMetaDataBuilder`]
    pub fn into_builder(self) -> ParquetMetaDataBuilder {
        self.into()
    }

    /// Returns file metadata as reference.
    pub fn file_metadata(&self) -> &FileMetaData {
        &self.file_metadata
    }

    /// Returns file decryptor as reference.
    #[cfg(feature = "encryption")]
    pub(crate) fn file_decryptor(&self) -> Option<&FileDecryptor> {
        self.file_decryptor.as_deref()
    }

    /// Returns number of row groups in this file.
    pub fn num_row_groups(&self) -> usize {
        self.row_groups.len()
    }

    /// Returns row group metadata for `i`th position.
    /// Position should be less than number of row groups `num_row_groups`.
    pub fn row_group(&self, i: usize) -> &RowGroupMetaData {
        &self.row_groups[i]
    }

    /// Returns slice of row groups in this file.
    pub fn row_groups(&self) -> &[RowGroupMetaData] {
        &self.row_groups
    }

    /// Returns the column index for this file if loaded
    ///
    /// Returns `None` if the parquet file does not have a `ColumnIndex` or
    /// [ArrowReaderOptions::with_page_index] was set to false.
    ///
    /// [ArrowReaderOptions::with_page_index]: https://docs.rs/parquet/latest/parquet/arrow/arrow_reader/struct.ArrowReaderOptions.html#method.with_page_index
    pub fn column_index(&self) -> Option<&ParquetColumnIndex> {
        self.column_index.as_ref()
    }

    /// Returns offset indexes in this file, if loaded
    ///
    /// Returns `None` if the parquet file does not have a `OffsetIndex` or
    /// [ArrowReaderOptions::with_page_index] was set to false.
    ///
    /// [ArrowReaderOptions::with_page_index]: https://docs.rs/parquet/latest/parquet/arrow/arrow_reader/struct.ArrowReaderOptions.html#method.with_page_index
    pub fn offset_index(&self) -> Option<&ParquetOffsetIndex> {
        self.offset_index.as_ref()
    }

    /// Estimate of the bytes allocated to store `ParquetMetadata`
    ///
    /// # Notes:
    ///
    /// 1. Includes size of self
    ///
    /// 2. Includes heap memory for sub fields such as [`FileMetaData`] and
    ///    [`RowGroupMetaData`].
    ///
    /// 3. Includes memory from shared pointers (e.g. [`SchemaDescPtr`]). This
    ///    means `memory_size` will over estimate the memory size if such pointers
    ///    are shared.
    ///
    /// 4. Does not include any allocator overheads
    pub fn memory_size(&self) -> usize {
        #[cfg(feature = "encryption")]
        let encryption_size = self.file_decryptor.heap_size();
        #[cfg(not(feature = "encryption"))]
        let encryption_size = 0usize;

        std::mem::size_of::<Self>()
            + self.file_metadata.heap_size()
            + self.row_groups.heap_size()
            + self.column_index.heap_size()
            + self.offset_index.heap_size()
            + encryption_size
    }

    /// Override the column index
    pub(crate) fn set_column_index(&mut self, index: Option<ParquetColumnIndex>) {
        self.column_index = index;
    }

    /// Override the offset index
    pub(crate) fn set_offset_index(&mut self, index: Option<ParquetOffsetIndex>) {
        self.offset_index = index;
    }
}

/// A builder for creating / manipulating [`ParquetMetaData`]
///
/// # Example creating a new [`ParquetMetaData`]
///
///```no_run
/// # use parquet::file::metadata::{FileMetaData, ParquetMetaData, ParquetMetaDataBuilder, RowGroupMetaData, RowGroupMetaDataBuilder};
/// # fn get_file_metadata() -> FileMetaData { unimplemented!(); }
/// // Create a new builder given the file metadata
/// let file_metadata = get_file_metadata();
/// // Create a row group
/// let row_group = RowGroupMetaData::builder(file_metadata.schema_descr_ptr())
///    .set_num_rows(100)
///    // ... (A real row group needs more than just the number of rows)
///    .build()
///    .unwrap();
/// // Create the final metadata
/// let metadata: ParquetMetaData = ParquetMetaDataBuilder::new(file_metadata)
///   .add_row_group(row_group)
///   .build();
/// ```
///
/// # Example modifying an existing [`ParquetMetaData`]
/// ```no_run
/// # use parquet::file::metadata::ParquetMetaData;
/// # fn load_metadata() -> ParquetMetaData { unimplemented!(); }
/// // Modify the metadata so only the last RowGroup remains
/// let metadata: ParquetMetaData = load_metadata();
/// let mut builder = metadata.into_builder();
///
/// // Take existing row groups to modify
/// let mut row_groups = builder.take_row_groups();
/// let last_row_group = row_groups.pop().unwrap();
///
/// let metadata = builder
///   .add_row_group(last_row_group)
///   .build();
/// ```
pub struct ParquetMetaDataBuilder(ParquetMetaData);

impl ParquetMetaDataBuilder {
    /// Create a new builder from a file metadata, with no row groups
    pub fn new(file_meta_data: FileMetaData) -> Self {
        Self(ParquetMetaData::new(file_meta_data, vec![]))
    }

    /// Create a new builder from an existing ParquetMetaData
    pub fn new_from_metadata(metadata: ParquetMetaData) -> Self {
        Self(metadata)
    }

    /// Adds a row group to the metadata
    pub fn add_row_group(mut self, row_group: RowGroupMetaData) -> Self {
        self.0.row_groups.push(row_group);
        self
    }

    /// Sets all the row groups to the specified list
    pub fn set_row_groups(mut self, row_groups: Vec<RowGroupMetaData>) -> Self {
        self.0.row_groups = row_groups;
        self
    }

    /// Takes ownership of the row groups in this builder, and clears the list
    /// of row groups.
    ///
    /// This can be used for more efficient creation of a new ParquetMetaData
    /// from an existing one.
    pub fn take_row_groups(&mut self) -> Vec<RowGroupMetaData> {
        std::mem::take(&mut self.0.row_groups)
    }

    /// Return a reference to the current row groups
    pub fn row_groups(&self) -> &[RowGroupMetaData] {
        &self.0.row_groups
    }

    /// Sets the column index
    pub fn set_column_index(mut self, column_index: Option<ParquetColumnIndex>) -> Self {
        self.0.column_index = column_index;
        self
    }

    /// Returns the current column index from the builder, replacing it with `None`
    pub fn take_column_index(&mut self) -> Option<ParquetColumnIndex> {
        std::mem::take(&mut self.0.column_index)
    }

    /// Return a reference to the current column index, if any
    pub fn column_index(&self) -> Option<&ParquetColumnIndex> {
        self.0.column_index.as_ref()
    }

    /// Sets the offset index
    pub fn set_offset_index(mut self, offset_index: Option<ParquetOffsetIndex>) -> Self {
        self.0.offset_index = offset_index;
        self
    }

    /// Returns the current offset index from the builder, replacing it with `None`
    pub fn take_offset_index(&mut self) -> Option<ParquetOffsetIndex> {
        std::mem::take(&mut self.0.offset_index)
    }

    /// Return a reference to the current offset index, if any
    pub fn offset_index(&self) -> Option<&ParquetOffsetIndex> {
        self.0.offset_index.as_ref()
    }

    /// Sets the file decryptor needed to decrypt this metadata.
    #[cfg(feature = "encryption")]
    pub(crate) fn set_file_decryptor(mut self, file_decryptor: Option<FileDecryptor>) -> Self {
        self.0.with_file_decryptor(file_decryptor);
        self
    }

    /// Creates a new ParquetMetaData from the builder
    pub fn build(self) -> ParquetMetaData {
        let Self(metadata) = self;
        metadata
    }
}

impl From<ParquetMetaData> for ParquetMetaDataBuilder {
    fn from(meta_data: ParquetMetaData) -> Self {
        Self(meta_data)
    }
}

thrift_struct!(
/// A key-value pair for [`FileMetaData`].
///
/// The `key` is always present; the `value` is optional.
pub struct KeyValue {
  1: required string key
  2: optional string value
}
);

impl KeyValue {
    /// Create a new key value pair
    pub fn new<F2>(key: String, value: F2) -> KeyValue
    where
        F2: Into<Option<String>>,
    {
        KeyValue {
            key,
            value: value.into(),
        }
    }
}

thrift_struct!(
/// PageEncodingStats for a column chunk and data page.
///
/// Each entry records a `count` for one combination of [`PageType`] and
/// [`Encoding`].
pub struct PageEncodingStats {
  1: required PageType page_type;
  2: required Encoding encoding;
  3: required i32 count;
}
);

/// Internal representation of the page encoding stats in the [`ColumnChunkMetaData`].
/// This is not publicly exposed, with different getters defined for each variant.
///
/// A column chunk stores at most one of the two representations.
#[derive(Debug, Clone, PartialEq)]
enum ParquetPageEncodingStats {
    /// The full array of stats as defined in the Parquet spec.
    Full(Vec<PageEncodingStats>),
    /// A condensed version of only page encodings seen.
    Mask(EncodingMask),
}

/// Reference counted ([`Arc`]) pointer for [`FileMetaData`].
pub type FileMetaDataPtr = Arc<FileMetaData>;

/// File level metadata for a Parquet file.
///
/// Includes the version of the file, metadata, number of rows, schema, and column orders
#[derive(Debug, Clone, PartialEq)]
pub struct FileMetaData {
    /// Version of this file
    version: i32,
    /// Number of rows in the file
    num_rows: i64,
    /// String message for the application that wrote this file, if any
    created_by: Option<String>,
    /// Key-value metadata of this file, if any
    key_value_metadata: Option<Vec<KeyValue>>,
    /// Shared schema descriptor for this file
    schema_descr: SchemaDescPtr,
    /// Column (sort) order of each column, by position; `None` when no
    /// column orders are available
    column_orders: Option<Vec<ColumnOrder>>,
    /// Encryption algorithm, stored boxed; `None` unless set via
    /// `with_encryption_algorithm`
    #[cfg(feature = "encryption")]
    encryption_algorithm: Option<Box<EncryptionAlgorithm>>,
    /// Footer signing key metadata bytes; `None` unless set via
    /// `with_footer_signing_key_metadata`
    #[cfg(feature = "encryption")]
    footer_signing_key_metadata: Option<Vec<u8>>,
}

impl FileMetaData {
    /// Creates new file metadata.
    pub fn new(
        version: i32,
        num_rows: i64,
        created_by: Option<String>,
        key_value_metadata: Option<Vec<KeyValue>>,
        schema_descr: SchemaDescPtr,
        column_orders: Option<Vec<ColumnOrder>>,
    ) -> Self {
        FileMetaData {
            version,
            num_rows,
            created_by,
            key_value_metadata,
            schema_descr,
            column_orders,
            #[cfg(feature = "encryption")]
            encryption_algorithm: None,
            #[cfg(feature = "encryption")]
            footer_signing_key_metadata: None,
        }
    }

    #[cfg(feature = "encryption")]
    pub(crate) fn with_encryption_algorithm(
        mut self,
        encryption_algorithm: Option<EncryptionAlgorithm>,
    ) -> Self {
        self.encryption_algorithm = encryption_algorithm.map(Box::new);
        self
    }

    #[cfg(feature = "encryption")]
    pub(crate) fn with_footer_signing_key_metadata(
        mut self,
        footer_signing_key_metadata: Option<Vec<u8>>,
    ) -> Self {
        self.footer_signing_key_metadata = footer_signing_key_metadata;
        self
    }

    /// Returns version of this file.
    pub fn version(&self) -> i32 {
        self.version
    }

    /// Returns number of rows in the file.
    pub fn num_rows(&self) -> i64 {
        self.num_rows
    }

    /// String message for application that wrote this file.
    ///
    /// This should have the following format:
    /// `<application> version <application version> (build <application build hash>)`.
    ///
    /// ```shell
    /// parquet-mr version 1.8.0 (build 0fda28af84b9746396014ad6a415b90592a98b3b)
    /// ```
    pub fn created_by(&self) -> Option<&str> {
        self.created_by.as_deref()
    }

    /// Returns key_value_metadata of this file.
    pub fn key_value_metadata(&self) -> Option<&Vec<KeyValue>> {
        self.key_value_metadata.as_ref()
    }

    /// Returns Parquet [`Type`] that describes schema in this file.
    ///
    /// [`Type`]: crate::schema::types::Type
    pub fn schema(&self) -> &SchemaType {
        self.schema_descr.root_schema()
    }

    /// Returns a reference to schema descriptor.
    pub fn schema_descr(&self) -> &SchemaDescriptor {
        &self.schema_descr
    }

    /// Returns reference counted clone for schema descriptor.
    pub fn schema_descr_ptr(&self) -> SchemaDescPtr {
        self.schema_descr.clone()
    }

    /// Column (sort) order used for `min` and `max` values of each column in this file.
    ///
    /// Each column order corresponds to one column, determined by its position in the
    /// list, matching the position of the column in the schema.
    ///
    /// When `None` is returned, there are no column orders available, and each column
    /// should be assumed to have undefined (legacy) column order.
    pub fn column_orders(&self) -> Option<&Vec<ColumnOrder>> {
        self.column_orders.as_ref()
    }

    /// Returns column order for `i`th column in this file.
    /// If column orders are not available, returns undefined (legacy) column order.
    pub fn column_order(&self, i: usize) -> ColumnOrder {
        self.column_orders
            .as_ref()
            .map(|data| data[i])
            .unwrap_or(ColumnOrder::UNDEFINED)
    }
}

thrift_struct!(
/// Sort order within a RowGroup of a leaf column
pub struct SortingColumn {
  /// The ordinal position of the column (in this row group)
  1: required i32 column_idx

  /// If true, indicates this column is sorted in descending order.
  2: required bool descending

  /// If true, nulls will come before non-null values, otherwise,
  /// nulls go at the end.
  3: required bool nulls_first
}
);

/// Reference counted ([`Arc`]) pointer for [`RowGroupMetaData`].
pub type RowGroupMetaDataPtr = Arc<RowGroupMetaData>;

/// Metadata for a row group
///
/// Includes [`ColumnChunkMetaData`] for each column in the row group, the number of rows,
/// the total byte size of the row group, and the [`SchemaDescriptor`] for the row group.
#[derive(Debug, Clone, PartialEq)]
pub struct RowGroupMetaData {
    /// Metadata of each column chunk in this row group
    columns: Vec<ColumnChunkMetaData>,
    /// Number of rows in this row group
    num_rows: i64,
    /// Sort ordering of the rows in this row group, if any
    sorting_columns: Option<Vec<SortingColumn>>,
    /// Total byte size of all uncompressed column data in this row group
    total_byte_size: i64,
    /// Shared schema descriptor for this row group
    schema_descr: SchemaDescPtr,
    /// We can't infer this from the file offset of the first column since there may be
    /// empty columns in the row group.
    file_offset: Option<i64>,
    /// Ordinal position of this row group in file
    ordinal: Option<i16>,
}

impl RowGroupMetaData {
    /// Returns builder for row group metadata.
    pub fn builder(schema_descr: SchemaDescPtr) -> RowGroupMetaDataBuilder {
        RowGroupMetaDataBuilder::new(schema_descr)
    }

    /// Number of columns in this row group.
    pub fn num_columns(&self) -> usize {
        self.columns.len()
    }

    /// Returns column chunk metadata for `i`th column.
    pub fn column(&self, i: usize) -> &ColumnChunkMetaData {
        &self.columns[i]
    }

    /// Returns slice of column chunk metadata.
    pub fn columns(&self) -> &[ColumnChunkMetaData] {
        &self.columns
    }

    /// Returns mutable slice of column chunk metadata.
    pub fn columns_mut(&mut self) -> &mut [ColumnChunkMetaData] {
        &mut self.columns
    }

    /// Number of rows in this row group.
    pub fn num_rows(&self) -> i64 {
        self.num_rows
    }

    /// Returns the sort ordering of the rows in this RowGroup if any
    pub fn sorting_columns(&self) -> Option<&Vec<SortingColumn>> {
        self.sorting_columns.as_ref()
    }

    /// Total byte size of all uncompressed column data in this row group.
    pub fn total_byte_size(&self) -> i64 {
        self.total_byte_size
    }

    /// Total size of all compressed column data in this row group.
    pub fn compressed_size(&self) -> i64 {
        self.columns.iter().map(|c| c.total_compressed_size).sum()
    }

    /// Returns reference to a schema descriptor.
    pub fn schema_descr(&self) -> &SchemaDescriptor {
        self.schema_descr.as_ref()
    }

    /// Returns reference counted clone of schema descriptor.
    pub fn schema_descr_ptr(&self) -> SchemaDescPtr {
        self.schema_descr.clone()
    }

    /// Returns ordinal position of this row group in file.
    ///
    /// For example if this is the first row group in the file, this will return 0.
    /// If this is the second row group in the file, this will return 1.
    #[inline(always)]
    pub fn ordinal(&self) -> Option<i16> {
        self.ordinal
    }

    /// Returns file offset of this row group in file.
    #[inline(always)]
    pub fn file_offset(&self) -> Option<i64> {
        self.file_offset
    }

    /// Converts this [`RowGroupMetaData`] into a [`RowGroupMetaDataBuilder`]
    pub fn into_builder(self) -> RowGroupMetaDataBuilder {
        RowGroupMetaDataBuilder(self)
    }
}

/// Builder for [`RowGroupMetaData`].
pub struct RowGroupMetaDataBuilder(RowGroupMetaData);

impl RowGroupMetaDataBuilder {
    /// Creates a builder for a row group with the given schema descriptor.
    ///
    /// The column list starts empty, with capacity reserved for the number
    /// of columns in the schema.
    fn new(schema_descr: SchemaDescPtr) -> Self {
        let columns = Vec::with_capacity(schema_descr.num_columns());
        let empty = RowGroupMetaData {
            columns,
            num_rows: 0,
            sorting_columns: None,
            total_byte_size: 0,
            schema_descr,
            file_offset: None,
            ordinal: None,
        };
        Self(empty)
    }

    /// Sets number of rows in this row group.
    pub fn set_num_rows(self, value: i64) -> Self {
        Self(RowGroupMetaData {
            num_rows: value,
            ..self.0
        })
    }

    /// Sets the sorting order for columns (`None` clears it).
    pub fn set_sorting_columns(self, value: Option<Vec<SortingColumn>>) -> Self {
        Self(RowGroupMetaData {
            sorting_columns: value,
            ..self.0
        })
    }

    /// Sets total size in bytes for this row group.
    pub fn set_total_byte_size(self, value: i64) -> Self {
        Self(RowGroupMetaData {
            total_byte_size: value,
            ..self.0
        })
    }

    /// Takes ownership of the column metadata in this builder, and clears
    /// the list of columns.
    ///
    /// This can be used for more efficient creation of a new RowGroupMetaData
    /// from an existing one.
    pub fn take_columns(&mut self) -> Vec<ColumnChunkMetaData> {
        let columns = &mut self.0.columns;
        std::mem::take(columns)
    }

    /// Replaces the column metadata of this row group with `value`.
    pub fn set_column_metadata(self, value: Vec<ColumnChunkMetaData>) -> Self {
        Self(RowGroupMetaData {
            columns: value,
            ..self.0
        })
    }

    /// Appends the metadata of one column chunk to this row group.
    pub fn add_column_metadata(self, value: ColumnChunkMetaData) -> Self {
        let Self(mut row_group) = self;
        row_group.columns.push(value);
        Self(row_group)
    }

    /// Sets ordinal for this row group.
    pub fn set_ordinal(self, value: i16) -> Self {
        Self(RowGroupMetaData {
            ordinal: Some(value),
            ..self.0
        })
    }

    /// Sets file offset for this row group.
    pub fn set_file_offset(self, value: i64) -> Self {
        Self(RowGroupMetaData {
            file_offset: Some(value),
            ..self.0
        })
    }

    /// Builds row group metadata.
    ///
    /// # Errors
    ///
    /// Returns an error if the number of column chunks added to the builder
    /// differs from the number of columns in the schema descriptor.
    pub fn build(self) -> Result<RowGroupMetaData> {
        let expected = self.0.schema_descr.num_columns();
        let actual = self.0.columns.len();

        if expected == actual {
            Ok(self.0)
        } else {
            Err(general_err!(
                "Column length mismatch: {} != {}",
                expected,
                actual
            ))
        }
    }

    /// Build row group metadata without validation.
    pub(super) fn build_unchecked(self) -> RowGroupMetaData {
        let Self(row_group) = self;
        row_group
    }
}

/// Metadata for a column chunk.
///
/// Values are read through the accessor methods on this type; the fields
/// themselves are private.
#[derive(Debug, Clone, PartialEq)]
pub struct ColumnChunkMetaData {
    /// Descriptor for this column
    column_descr: ColumnDescPtr,
    /// All encodings used for this column
    encodings: EncodingMask,
    /// File where the column chunk is stored; if not set, the chunk is
    /// assumed to belong to the same file as the metadata
    file_path: Option<String>,
    /// Byte offset of `ColumnMetaData` in `file_path` (deprecated in the
    /// Parquet specification, see [`Self::file_offset`])
    file_offset: i64,
    num_values: i64,
    compression: Compression,
    total_compressed_size: i64,
    total_uncompressed_size: i64,
    data_page_offset: i64,
    index_page_offset: Option<i64>,
    dictionary_page_offset: Option<i64>,
    statistics: Option<Statistics>,
    geo_statistics: Option<Box<geo_statistics::GeospatialStatistics>>,
    /// Page encoding stats, in either of the two internal representations
    encoding_stats: Option<ParquetPageEncodingStats>,
    bloom_filter_offset: Option<i64>,
    bloom_filter_length: Option<i32>,
    offset_index_offset: Option<i64>,
    offset_index_length: Option<i32>,
    column_index_offset: Option<i64>,
    column_index_length: Option<i32>,
    unencoded_byte_array_data_bytes: Option<i64>,
    repetition_level_histogram: Option<LevelHistogram>,
    definition_level_histogram: Option<LevelHistogram>,
    // The two fields below only exist when the `encryption` feature is enabled.
    #[cfg(feature = "encryption")]
    column_crypto_metadata: Option<Box<ColumnCryptoMetaData>>,
    #[cfg(feature = "encryption")]
    encrypted_column_metadata: Option<Vec<u8>>,
}

/// Histograms for repetition and definition levels.
///
/// Each histogram is a vector of length `max_level + 1`. The value at index `i` is the number of
/// values at level `i`.
///
/// For example, `vec[0]` is the number of rows with level 0, `vec[1]` is the
/// number of rows with level 1, and so on.
///
#[derive(Debug, Clone, PartialEq, Eq, Hash, Default)]
pub struct LevelHistogram {
    inner: Vec<i64>,
}

impl LevelHistogram {
    /// Creates a new level histogram data.
    ///
    /// Length will be `max_level + 1`.
    ///
    /// Returns `None` when `max_level == 0` (because histograms are not necessary in this case)
    pub fn try_new(max_level: i16) -> Option<Self> {
        if max_level > 0 {
            Some(Self {
                inner: vec![0; max_level as usize + 1],
            })
        } else {
            None
        }
    }
    /// Returns a reference to the the histogram's values.
    pub fn values(&self) -> &[i64] {
        &self.inner
    }

    /// Return the inner vector, consuming self
    pub fn into_inner(self) -> Vec<i64> {
        self.inner
    }

    /// Returns the histogram value at the given index.
    ///
    /// The value of `i` is the number of values with level `i`. For example,
    /// `get(1)` returns the number of values with level 1.
    ///
    /// Returns `None` if the index is out of bounds.
    pub fn get(&self, index: usize) -> Option<i64> {
        self.inner.get(index).copied()
    }

    /// Adds the values from the other histogram to this histogram
    ///
    /// # Panics
    /// If the histograms have different lengths
    pub fn add(&mut self, other: &Self) {
        assert_eq!(self.len(), other.len());
        for (dst, src) in self.inner.iter_mut().zip(other.inner.iter()) {
            *dst += src;
        }
    }

    /// return the length of the histogram
    pub fn len(&self) -> usize {
        self.inner.len()
    }

    /// returns if the histogram is empty
    pub fn is_empty(&self) -> bool {
        self.inner.is_empty()
    }

    /// Sets the values of all histogram levels to 0.
    pub fn reset(&mut self) {
        for value in self.inner.iter_mut() {
            *value = 0;
        }
    }

    /// Updates histogram values using provided repetition levels
    ///
    /// # Panics
    /// if any of the levels is greater than the length of the histogram (
    /// the argument supplied to [`Self::try_new`])
    pub fn update_from_levels(&mut self, levels: &[i16]) {
        for &level in levels {
            self.inner[level as usize] += 1;
        }
    }
}

impl From<Vec<i64>> for LevelHistogram {
    fn from(inner: Vec<i64>) -> Self {
        Self { inner }
    }
}

/// Unwraps the histogram into its vector of per-level counts.
impl From<LevelHistogram> for Vec<i64> {
    fn from(histogram: LevelHistogram) -> Self {
        histogram.inner
    }
}

/// Reports the heap memory used by the histogram, which is that of its inner vector.
impl HeapSize for LevelHistogram {
    fn heap_size(&self) -> usize {
        // Delegates to the `HeapSize` implementation of the wrapped `Vec<i64>`
        self.inner.heap_size()
    }
}

/// Represents common operations for a column chunk.
impl ColumnChunkMetaData {
    /// Returns builder for column chunk metadata.
    pub fn builder(column_descr: ColumnDescPtr) -> ColumnChunkMetaDataBuilder {
        ColumnChunkMetaDataBuilder::new(column_descr)
    }

    /// File where the column chunk is stored.
    ///
    /// If not set, assumed to belong to the same file as the metadata.
    /// This path is relative to the current file.
    pub fn file_path(&self) -> Option<&str> {
        self.file_path.as_deref()
    }

    /// Byte offset of `ColumnMetaData` in `file_path()`.
    ///
    /// Note that the meaning of this field has been inconsistent between implementations
    /// so its use has since been deprecated in the Parquet specification. Modern implementations
    /// will set this to `0` to indicate that the `ColumnMetaData` is solely contained in the
    /// `ColumnChunk` struct.
    pub fn file_offset(&self) -> i64 {
        self.file_offset
    }

    /// Type of this column. Must be primitive.
    pub fn column_type(&self) -> Type {
        self.column_descr.physical_type()
    }

    /// Path (or identifier) of this column.
    pub fn column_path(&self) -> &ColumnPath {
        self.column_descr.path()
    }

    /// Descriptor for this column.
    pub fn column_descr(&self) -> &ColumnDescriptor {
        self.column_descr.as_ref()
    }

    /// Reference counted clone of descriptor for this column.
    pub fn column_descr_ptr(&self) -> ColumnDescPtr {
        self.column_descr.clone()
    }

    /// All encodings used for this column.
    pub fn encodings(&self) -> impl Iterator<Item = Encoding> {
        self.encodings.encodings()
    }

    /// All encodings used for this column, returned as a bitmask.
    pub fn encodings_mask(&self) -> &EncodingMask {
        &self.encodings
    }

    /// Total number of values in this column chunk.
    pub fn num_values(&self) -> i64 {
        self.num_values
    }

    /// Compression for this column.
    pub fn compression(&self) -> Compression {
        self.compression
    }

    /// Returns the total compressed data size of this column chunk.
    pub fn compressed_size(&self) -> i64 {
        self.total_compressed_size
    }

    /// Returns the total uncompressed data size of this column chunk.
    pub fn uncompressed_size(&self) -> i64 {
        self.total_uncompressed_size
    }

    /// Returns the offset for the column data.
    pub fn data_page_offset(&self) -> i64 {
        self.data_page_offset
    }

    /// Returns the offset for the index page, if any.
    pub fn index_page_offset(&self) -> Option<i64> {
        self.index_page_offset
    }

    /// Returns the offset for the dictionary page, if any.
    pub fn dictionary_page_offset(&self) -> Option<i64> {
        self.dictionary_page_offset
    }

    /// Returns the offset and length in bytes of the column chunk within the file
    ///
    /// The chunk starts at the dictionary page offset when one is set, and at the
    /// data page offset otherwise. Its length is [`Self::compressed_size`].
    ///
    /// # Panics
    /// If the start offset or the compressed size is negative
    pub fn byte_range(&self) -> (u64, u64) {
        let col_start = match self.dictionary_page_offset() {
            Some(dictionary_page_offset) => dictionary_page_offset,
            None => self.data_page_offset(),
        };
        let col_len = self.compressed_size();
        // Both values are stored as `i64`; reject negatives before casting to `u64`
        assert!(
            col_start >= 0 && col_len >= 0,
            "column start and length should not be negative"
        );
        (col_start as u64, col_len as u64)
    }

    /// Returns statistics that are set for this column chunk,
    /// or `None` if no statistics are available.
    pub fn statistics(&self) -> Option<&Statistics> {
        self.statistics.as_ref()
    }

    /// Returns geospatial statistics that are set for this column chunk,
    /// or `None` if no geospatial statistics are available.
    pub fn geo_statistics(&self) -> Option<&geo_statistics::GeospatialStatistics> {
        self.geo_statistics.as_deref()
    }

    /// Returns the page encoding statistics, or `None` if no page encoding statistics
    /// are available (or they were converted to a mask).
    pub fn page_encoding_stats(&self) -> Option<&Vec<PageEncodingStats>> {
        match self.encoding_stats.as_ref() {
            Some(ParquetPageEncodingStats::Full(stats)) => Some(stats),
            _ => None,
        }
    }

    /// Returns the page encoding statistics reduced to a bitmask, or `None` if statistics are
    /// not available (or they were left in their original form).
    ///
    /// The [`PageEncodingStats`] struct was added to the Parquet specification specifically to
    /// enable fast determination of whether all pages in a column chunk are dictionary encoded
    /// (see <https://github.com/apache/parquet-format/pull/16>).
    /// Decoding the full page encoding statistics, however, can be very costly, and is not
    /// necessary to support the aforementioned use case. As an alternative, this crate can
    /// instead distill the list of `PageEncodingStats` down to a bitmask of just the encodings
    /// used for data pages
    /// (see [`ParquetMetaDataOptions::set_encoding_stats_as_mask`]).
    /// To test for an all-dictionary-encoded chunk one could use this bitmask in the following way:
    ///
    /// ```rust
    /// use parquet::basic::Encoding;
    /// use parquet::file::metadata::ColumnChunkMetaData;
    /// // test if all data pages in the column chunk are dictionary encoded
    /// fn is_all_dictionary_encoded(col_meta: &ColumnChunkMetaData) -> bool {
    ///     // check that dictionary encoding was used
    ///     col_meta.dictionary_page_offset().is_some()
    ///         && col_meta.page_encoding_stats_mask().is_some_and(|mask| {
    ///             // mask should only have one bit set, either for PLAIN_DICTIONARY or
    ///             // RLE_DICTIONARY
    ///             mask.is_only(Encoding::PLAIN_DICTIONARY) || mask.is_only(Encoding::RLE_DICTIONARY)
    ///         })
    /// }
    /// ```
    pub fn page_encoding_stats_mask(&self) -> Option<&EncodingMask> {
        match self.encoding_stats.as_ref() {
            Some(ParquetPageEncodingStats::Mask(stats)) => Some(stats),
            _ => None,
        }
    }

    /// Returns the offset for the bloom filter.
    pub fn bloom_filter_offset(&self) -> Option<i64> {
        self.bloom_filter_offset
    }

    /// Returns the length of the bloom filter in bytes.
    pub fn bloom_filter_length(&self) -> Option<i32> {
        self.bloom_filter_length
    }

    /// Returns the offset for the column index.
    pub fn column_index_offset(&self) -> Option<i64> {
        self.column_index_offset
    }

    /// Returns the length of the column index in bytes.
    pub fn column_index_length(&self) -> Option<i32> {
        self.column_index_length
    }

    /// Returns the byte range of the column index, if any
    ///
    /// Returns `None` if the offset or the length is not set, or if either is negative.
    pub(crate) fn column_index_range(&self) -> Option<Range<u64>> {
        let offset = u64::try_from(self.column_index_offset?).ok()?;
        let length = u64::try_from(self.column_index_length?).ok()?;
        Some(offset..(offset + length))
    }

    /// Returns the offset for the offset index.
    pub fn offset_index_offset(&self) -> Option<i64> {
        self.offset_index_offset
    }

    /// Returns the length of the offset index in bytes.
    pub fn offset_index_length(&self) -> Option<i32> {
        self.offset_index_length
    }

    /// Returns the byte range of the offset index, if any
    ///
    /// Returns `None` if the offset or the length is not set, or if either is negative.
    pub(crate) fn offset_index_range(&self) -> Option<Range<u64>> {
        let offset = u64::try_from(self.offset_index_offset?).ok()?;
        let length = u64::try_from(self.offset_index_length?).ok()?;
        Some(offset..(offset + length))
    }

    /// Returns the number of bytes of variable length data after decoding.
    ///
    /// Only set for BYTE_ARRAY columns. This field may not be set by older
    /// writers.
    pub fn unencoded_byte_array_data_bytes(&self) -> Option<i64> {
        self.unencoded_byte_array_data_bytes
    }

    /// Returns the repetition level histogram.
    ///
    /// The returned value `vec[i]` is how many values are at repetition level `i`. For example,
    /// `vec[0]` indicates how many rows the page contains.
    /// This field may not be set by older writers.
    pub fn repetition_level_histogram(&self) -> Option<&LevelHistogram> {
        self.repetition_level_histogram.as_ref()
    }

    /// Returns the definition level histogram.
    ///
    /// The returned value `vec[i]` is how many values are at definition level `i`. For example,
    /// `vec[max_definition_level]` indicates how many non-null values are present in the page.
    /// This field may not be set by older writers.
    pub fn definition_level_histogram(&self) -> Option<&LevelHistogram> {
        self.definition_level_histogram.as_ref()
    }

    /// Returns the encryption metadata for this column chunk.
    #[cfg(feature = "encryption")]
    pub fn crypto_metadata(&self) -> Option<&ColumnCryptoMetaData> {
        self.column_crypto_metadata.as_deref()
    }

    /// Converts this [`ColumnChunkMetaData`] into a [`ColumnChunkMetaDataBuilder`]
    pub fn into_builder(self) -> ColumnChunkMetaDataBuilder {
        ColumnChunkMetaDataBuilder::from(self)
    }
}

/// Builder for [`ColumnChunkMetaData`]
///
/// This builder is used to create a new column chunk metadata or modify an
/// existing one.
///
/// The builder wraps the [`ColumnChunkMetaData`] value under construction: each
/// setter updates that value and returns the builder, and `build` hands it back.
///
/// # Example
/// ```no_run
/// # use parquet::file::metadata::{ColumnChunkMetaData, ColumnChunkMetaDataBuilder};
/// # fn get_column_chunk_metadata() -> ColumnChunkMetaData { unimplemented!(); }
/// let column_chunk_metadata = get_column_chunk_metadata();
/// // create a new builder from existing column chunk metadata
/// let builder = ColumnChunkMetaDataBuilder::from(column_chunk_metadata);
/// // clear the statistics:
/// let column_chunk_metadata: ColumnChunkMetaData = builder
///   .clear_statistics()
///   .build()
///   .unwrap();
/// ```
pub struct ColumnChunkMetaDataBuilder(ColumnChunkMetaData);

impl ColumnChunkMetaDataBuilder {
    /// Creates new column chunk metadata builder.
    ///
    /// Numeric fields start at `0`, the compression at [`Compression::UNCOMPRESSED`],
    /// the encodings at their default value and every optional field at `None`.
    ///
    /// See also [`ColumnChunkMetaData::builder`]
    fn new(column_descr: ColumnDescPtr) -> Self {
        Self(ColumnChunkMetaData {
            column_descr,
            encodings: Default::default(),
            file_path: None,
            file_offset: 0,
            num_values: 0,
            compression: Compression::UNCOMPRESSED,
            total_compressed_size: 0,
            total_uncompressed_size: 0,
            data_page_offset: 0,
            index_page_offset: None,
            dictionary_page_offset: None,
            statistics: None,
            geo_statistics: None,
            encoding_stats: None,
            bloom_filter_offset: None,
            bloom_filter_length: None,
            offset_index_offset: None,
            offset_index_length: None,
            column_index_offset: None,
            column_index_length: None,
            unencoded_byte_array_data_bytes: None,
            repetition_level_histogram: None,
            definition_level_histogram: None,
            #[cfg(feature = "encryption")]
            column_crypto_metadata: None,
            #[cfg(feature = "encryption")]
            encrypted_column_metadata: None,
        })
    }

    /// Sets list of encodings for this column chunk.
    ///
    /// The list is converted to an [`EncodingMask`], replacing any previously set encodings.
    pub fn set_encodings(mut self, encodings: Vec<Encoding>) -> Self {
        self.0.encodings = EncodingMask::new_from_encodings(encodings.iter());
        self
    }

    /// Sets the encodings mask for this column chunk.
    pub fn set_encodings_mask(mut self, encodings: EncodingMask) -> Self {
        self.0.encodings = encodings;
        self
    }

    /// Sets optional file path for this column chunk.
    pub fn set_file_path(mut self, value: String) -> Self {
        self.0.file_path = Some(value);
        self
    }

    /// Sets number of values.
    pub fn set_num_values(mut self, value: i64) -> Self {
        self.0.num_values = value;
        self
    }

    /// Sets compression.
    pub fn set_compression(mut self, value: Compression) -> Self {
        self.0.compression = value;
        self
    }

    /// Sets total compressed size in bytes.
    pub fn set_total_compressed_size(mut self, value: i64) -> Self {
        self.0.total_compressed_size = value;
        self
    }

    /// Sets total uncompressed size in bytes.
    pub fn set_total_uncompressed_size(mut self, value: i64) -> Self {
        self.0.total_uncompressed_size = value;
        self
    }

    /// Sets data page offset in bytes.
    pub fn set_data_page_offset(mut self, value: i64) -> Self {
        self.0.data_page_offset = value;
        self
    }

    /// Sets optional dictionary page offset in bytes.
    pub fn set_dictionary_page_offset(mut self, value: Option<i64>) -> Self {
        self.0.dictionary_page_offset = value;
        self
    }

    /// Sets optional index page offset in bytes.
    pub fn set_index_page_offset(mut self, value: Option<i64>) -> Self {
        self.0.index_page_offset = value;
        self
    }

    /// Sets statistics for this column chunk.
    pub fn set_statistics(mut self, value: Statistics) -> Self {
        self.0.statistics = Some(value);
        self
    }

    /// Sets geospatial statistics for this column chunk.
    pub fn set_geo_statistics(mut self, value: Box<geo_statistics::GeospatialStatistics>) -> Self {
        self.0.geo_statistics = Some(value);
        self
    }

    /// Clears the statistics for this column chunk.
    ///
    /// Only the regular statistics are cleared; geospatial statistics are left untouched.
    pub fn clear_statistics(mut self) -> Self {
        self.0.statistics = None;
        self
    }

    /// Sets page encoding stats for this column chunk.
    ///
    /// This will overwrite any existing stats, either `Vec` based or bitmask.
    pub fn set_page_encoding_stats(mut self, value: Vec<PageEncodingStats>) -> Self {
        self.0.encoding_stats = Some(ParquetPageEncodingStats::Full(value));
        self
    }

    /// Sets page encoding stats mask for this column chunk.
    ///
    /// This will overwrite any existing stats, either `Vec` based or bitmask.
    pub fn set_page_encoding_stats_mask(mut self, value: EncodingMask) -> Self {
        self.0.encoding_stats = Some(ParquetPageEncodingStats::Mask(value));
        self
    }

    /// Clears the page encoding stats for this column chunk.
    pub fn clear_page_encoding_stats(mut self) -> Self {
        self.0.encoding_stats = None;
        self
    }

    /// Sets optional bloom filter offset in bytes.
    pub fn set_bloom_filter_offset(mut self, value: Option<i64>) -> Self {
        self.0.bloom_filter_offset = value;
        self
    }

    /// Sets optional bloom filter length in bytes.
    pub fn set_bloom_filter_length(mut self, value: Option<i32>) -> Self {
        self.0.bloom_filter_length = value;
        self
    }

    /// Sets optional offset index offset in bytes.
    pub fn set_offset_index_offset(mut self, value: Option<i64>) -> Self {
        self.0.offset_index_offset = value;
        self
    }

    /// Sets optional offset index length in bytes.
    pub fn set_offset_index_length(mut self, value: Option<i32>) -> Self {
        self.0.offset_index_length = value;
        self
    }

    /// Sets optional column index offset in bytes.
    pub fn set_column_index_offset(mut self, value: Option<i64>) -> Self {
        self.0.column_index_offset = value;
        self
    }

    /// Sets optional column index length in bytes.
    pub fn set_column_index_length(mut self, value: Option<i32>) -> Self {
        self.0.column_index_length = value;
        self
    }

    /// Sets optional length of variable length data in bytes.
    pub fn set_unencoded_byte_array_data_bytes(mut self, value: Option<i64>) -> Self {
        self.0.unencoded_byte_array_data_bytes = value;
        self
    }

    /// Sets optional repetition level histogram
    pub fn set_repetition_level_histogram(mut self, value: Option<LevelHistogram>) -> Self {
        self.0.repetition_level_histogram = value;
        self
    }

    /// Sets optional definition level histogram
    pub fn set_definition_level_histogram(mut self, value: Option<LevelHistogram>) -> Self {
        self.0.definition_level_histogram = value;
        self
    }

    #[cfg(feature = "encryption")]
    /// Set the encryption metadata for an encrypted column
    pub fn set_column_crypto_metadata(mut self, value: Option<ColumnCryptoMetaData>) -> Self {
        self.0.column_crypto_metadata = value.map(Box::new);
        self
    }

    #[cfg(feature = "encryption")]
    /// Set the optional encrypted column metadata bytes for an encrypted column
    pub fn set_encrypted_column_metadata(mut self, value: Option<Vec<u8>>) -> Self {
        self.0.encrypted_column_metadata = value;
        self
    }

    /// Builds column chunk metadata.
    ///
    /// No validation is performed here, so this currently always returns `Ok`.
    pub fn build(self) -> Result<ColumnChunkMetaData> {
        Ok(self.0)
    }
}

/// Builder for Parquet [`ColumnIndex`], part of the Parquet [PageIndex]
///
/// Per-page statistics are added with `append` (and optionally
/// `append_histograms`); `build` then produces the index for the column's type.
///
/// [PageIndex]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
/// [`ColumnIndex`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub struct ColumnIndexBuilder {
    /// Physical type of the column; selects which kind of index `build` produces
    column_type: Type,
    /// The `null_page` flag of every appended page, in append order
    null_pages: Vec<bool>,
    /// The minimum value of every appended page as raw bytes, in append order
    min_values: Vec<Vec<u8>>,
    /// The maximum value of every appended page as raw bytes, in append order
    max_values: Vec<Vec<u8>>,
    /// The null count of every appended page, in append order
    null_counts: Vec<i64>,
    /// Ordering of the page min/max values; `UNORDERED` unless set otherwise
    boundary_order: BoundaryOrder,
    /// contains the concatenation of the histograms of all pages
    repetition_level_histograms: Option<Vec<i64>>,
    /// contains the concatenation of the histograms of all pages
    definition_level_histograms: Option<Vec<i64>>,
    /// Is the information in the builder valid?
    ///
    /// Set to `false` if any entry in the page doesn't have statistics for
    /// some reason, so statistics for that page won't be written to the file.
    /// This might happen if the page is entirely null, or
    /// is a floating point column without any non-nan values
    /// e.g. <https://github.com/apache/parquet-format/pull/196>
    valid: bool,
}

impl ColumnIndexBuilder {
    /// Creates an empty, valid column index builder for a column of the given
    /// physical type.
    ///
    /// The boundary order starts out as [`BoundaryOrder::UNORDERED`] and no
    /// level histograms are recorded.
    pub fn new(column_type: Type) -> Self {
        Self {
            column_type,
            null_pages: vec![],
            min_values: vec![],
            max_values: vec![],
            null_counts: vec![],
            boundary_order: BoundaryOrder::UNORDERED,
            repetition_level_histograms: None,
            definition_level_histograms: None,
            valid: true,
        }
    }

    /// Records the statistics of the next page: whether it is a null page, its
    /// minimum and maximum values as raw bytes, and its null count.
    pub fn append(
        &mut self,
        null_page: bool,
        min_value: Vec<u8>,
        max_value: Vec<u8>,
        null_count: i64,
    ) {
        self.null_counts.push(null_count);
        self.max_values.push(max_value);
        self.min_values.push(min_value);
        self.null_pages.push(null_page);
    }

    /// Append the given page-level histograms to the [`ColumnIndex`] histograms.
    /// Does nothing if the `ColumnIndexBuilder` is not in the `valid` state.
    ///
    /// A histogram that is `None` is skipped; the other one is still appended.
    ///
    /// [`ColumnIndex`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
    pub fn append_histograms(
        &mut self,
        repetition_level_histogram: &Option<LevelHistogram>,
        definition_level_histogram: &Option<LevelHistogram>,
    ) {
        /// Appends one page's counts to the concatenated counts of all pages,
        /// creating the concatenated vector on first use.
        fn concat(all_pages: &mut Option<Vec<i64>>, page: Option<&LevelHistogram>) {
            if let Some(page) = page {
                all_pages
                    .get_or_insert_with(Vec::new)
                    .extend_from_slice(page.values());
            }
        }

        if self.valid {
            concat(
                &mut self.repetition_level_histograms,
                repetition_level_histogram.as_ref(),
            );
            concat(
                &mut self.definition_level_histograms,
                definition_level_histogram.as_ref(),
            );
        }
    }

    /// Set the boundary order of the column index
    pub fn set_boundary_order(&mut self, boundary_order: BoundaryOrder) {
        self.boundary_order = boundary_order;
    }

    /// Mark this column index as invalid
    pub fn to_invalid(&mut self) {
        self.valid = false;
    }

    /// Is the information in the builder valid?
    pub fn valid(&self) -> bool {
        self.valid
    }

    /// Build and get the column index
    ///
    /// The variant of the returned [`ColumnIndexMetaData`] matches the physical
    /// type this builder was created with.
    ///
    /// Note: callers should check [`Self::valid`] before calling this method
    pub fn build(self) -> Result<ColumnIndexMetaData> {
        let index = match self.column_type {
            Type::BOOLEAN => ColumnIndexMetaData::BOOLEAN(self.build_page_index()?),
            Type::INT32 => ColumnIndexMetaData::INT32(self.build_page_index()?),
            Type::INT64 => ColumnIndexMetaData::INT64(self.build_page_index()?),
            Type::INT96 => ColumnIndexMetaData::INT96(self.build_page_index()?),
            Type::FLOAT => ColumnIndexMetaData::FLOAT(self.build_page_index()?),
            Type::DOUBLE => ColumnIndexMetaData::DOUBLE(self.build_page_index()?),
            Type::BYTE_ARRAY => ColumnIndexMetaData::BYTE_ARRAY(self.build_byte_array_index()?),
            Type::FIXED_LEN_BYTE_ARRAY => {
                ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(self.build_byte_array_index()?)
            }
        };
        Ok(index)
    }

    /// Builds the index for a primitive value type `T` from the collected
    /// page statistics.
    fn build_page_index<T>(self) -> Result<PrimitiveColumnIndex<T>>
    where
        T: ParquetValueType,
    {
        let Self {
            null_pages,
            min_values,
            max_values,
            null_counts,
            boundary_order,
            repetition_level_histograms,
            definition_level_histograms,
            ..
        } = self;
        // The index constructor takes the min/max values as borrowed byte slices
        let mins: Vec<&[u8]> = min_values.iter().map(Vec::as_slice).collect();
        let maxes: Vec<&[u8]> = max_values.iter().map(Vec::as_slice).collect();

        PrimitiveColumnIndex::try_new(
            null_pages,
            boundary_order,
            Some(null_counts),
            repetition_level_histograms,
            definition_level_histograms,
            mins,
            maxes,
        )
    }

    /// Builds the index for byte array columns from the collected page statistics.
    fn build_byte_array_index(self) -> Result<ByteArrayColumnIndex> {
        let Self {
            null_pages,
            min_values,
            max_values,
            null_counts,
            boundary_order,
            repetition_level_histograms,
            definition_level_histograms,
            ..
        } = self;
        // The index constructor takes the min/max values as borrowed byte slices
        let mins: Vec<&[u8]> = min_values.iter().map(Vec::as_slice).collect();
        let maxes: Vec<&[u8]> = max_values.iter().map(Vec::as_slice).collect();

        ByteArrayColumnIndex::try_new(
            null_pages,
            boundary_order,
            Some(null_counts),
            repetition_level_histograms,
            definition_level_histograms,
            mins,
            maxes,
        )
    }
}

impl From<ColumnChunkMetaData> for ColumnChunkMetaDataBuilder {
    fn from(value: ColumnChunkMetaData) -> Self {
        ColumnChunkMetaDataBuilder(value)
    }
}

/// Builder for offset index, part of the Parquet [PageIndex].
///
/// [PageIndex]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub struct OffsetIndexBuilder {
    /// Offset of every appended page, in append order
    offset_array: Vec<i64>,
    /// Compressed size of every appended page, in append order
    compressed_page_size_array: Vec<i32>,
    /// Index of the first row of every page, derived from the appended row counts
    first_row_index_array: Vec<i64>,
    /// Unencoded byte array data bytes of the pages that reported a value;
    /// stays `None` until the first value is appended
    unencoded_byte_array_data_bytes_array: Option<Vec<i64>>,
    /// Running total of the appended row counts, i.e. the first row index of
    /// the next page
    current_first_row_index: i64,
}

impl Default for OffsetIndexBuilder {
    fn default() -> Self {
        Self::new()
    }
}

impl OffsetIndexBuilder {
    /// Creates a new, empty offset index builder whose first page starts at row 0.
    pub fn new() -> Self {
        Self {
            offset_array: vec![],
            compressed_page_size_array: vec![],
            first_row_index_array: vec![],
            unencoded_byte_array_data_bytes_array: None,
            current_first_row_index: 0,
        }
    }

    /// Append the row count of the next page.
    ///
    /// The page's first row index is the sum of the row counts appended before it.
    pub fn append_row_count(&mut self, row_count: i64) {
        self.first_row_index_array
            .push(self.current_first_row_index);
        self.current_first_row_index += row_count;
    }

    /// Append the offset and size of the next page.
    pub fn append_offset_and_size(&mut self, offset: i64, compressed_page_size: i32) {
        self.compressed_page_size_array.push(compressed_page_size);
        self.offset_array.push(offset);
    }

    /// Append the unencoded byte array data bytes of the next page.
    ///
    /// A `None` value is ignored, so nothing is recorded for that page.
    pub fn append_unencoded_byte_array_data_bytes(
        &mut self,
        unencoded_byte_array_data_bytes: Option<i64>,
    ) {
        if let Some(bytes) = unencoded_byte_array_data_bytes {
            self.unencoded_byte_array_data_bytes_array
                .get_or_insert_with(Vec::new)
                .push(bytes);
        }
    }

    /// Build and get the thrift metadata of offset index
    ///
    /// One page location is produced per page for which an offset, a size and a
    /// row count were all appended; surplus entries of a longer list are dropped.
    pub fn build(self) -> OffsetIndexMetaData {
        let Self {
            offset_array,
            compressed_page_size_array,
            first_row_index_array,
            unencoded_byte_array_data_bytes_array,
            ..
        } = self;

        let page_locations = offset_array
            .into_iter()
            .zip(compressed_page_size_array)
            .zip(first_row_index_array)
            .map(
                |((offset, compressed_page_size), first_row_index)| PageLocation {
                    offset,
                    compressed_page_size,
                    first_row_index,
                },
            )
            .collect::<Vec<_>>();

        OffsetIndexMetaData {
            page_locations,
            unencoded_byte_array_data_bytes: unencoded_byte_array_data_bytes_array,
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::basic::{PageType, SortOrder};
    use crate::file::metadata::thrift::tests::{read_column_chunk, read_row_group};

    /// A row group written as thrift and read back must equal the original
    #[test]
    fn test_row_group_metadata_thrift_conversion() {
        let schema_descr = get_test_schema_descr();

        // One default column chunk per leaf column of the schema
        let columns: Vec<_> = schema_descr
            .columns()
            .iter()
            .map(|descr| ColumnChunkMetaData::builder(descr.clone()).build().unwrap())
            .collect();
        let expected = RowGroupMetaData::builder(schema_descr.clone())
            .set_num_rows(1000)
            .set_total_byte_size(2000)
            .set_column_metadata(columns)
            .set_ordinal(1)
            .build()
            .unwrap();

        let mut bytes = Vec::new();
        let mut protocol = ThriftCompactOutputProtocol::new(&mut bytes);
        expected.write_thrift(&mut protocol).unwrap();

        let decoded = read_row_group(&mut bytes, schema_descr).unwrap();

        assert_eq!(decoded, expected);
    }

    /// Building a row group without column metadata must fail with a length mismatch
    #[test]
    fn test_row_group_metadata_thrift_conversion_empty() {
        let schema_descr = get_test_schema_descr();

        // No column metadata is supplied although the schema has columns
        let err = RowGroupMetaData::builder(schema_descr)
            .build()
            .unwrap_err();

        assert_eq!(
            err.to_string(),
            "Parquet error: Column length mismatch: 2 != 0"
        );
    }

    /// Test reading a corrupted Parquet file with 3 columns in its schema but only 2 in its row group
    #[test]
    fn test_row_group_metadata_thrift_corrupted() {
        /// Builds a flat schema named "schema" with one INT32 leaf per entry of `names`
        fn int32_schema(names: &[&str]) -> Arc<SchemaDescriptor> {
            let fields: Vec<_> = names
                .iter()
                .map(|&name| {
                    Arc::new(
                        SchemaType::primitive_type_builder(name, Type::INT32)
                            .build()
                            .unwrap(),
                    )
                })
                .collect();
            Arc::new(SchemaDescriptor::new(Arc::new(
                SchemaType::group_type_builder("schema")
                    .with_fields(fields)
                    .build()
                    .unwrap(),
            )))
        }

        let schema_descr_2cols = int32_schema(&["a", "b"]);
        let schema_descr_3cols = int32_schema(&["a", "b", "c"]);

        // Write a row group that only knows about the two-column schema ...
        let two_column_chunks = vec![
            ColumnChunkMetaData::builder(schema_descr_2cols.column(0))
                .build()
                .unwrap(),
            ColumnChunkMetaData::builder(schema_descr_2cols.column(1))
                .build()
                .unwrap(),
        ];
        let row_group_meta_2cols = RowGroupMetaData::builder(schema_descr_2cols.clone())
            .set_num_rows(1000)
            .set_total_byte_size(2000)
            .set_column_metadata(two_column_chunks)
            .set_ordinal(1)
            .build()
            .unwrap();
        let mut bytes = Vec::new();
        let mut protocol = ThriftCompactOutputProtocol::new(&mut bytes);
        row_group_meta_2cols.write_thrift(&mut protocol).unwrap();

        // ... and read it back against the three-column schema
        let message = read_row_group(&mut bytes, schema_descr_3cols)
            .unwrap_err()
            .to_string();
        assert_eq!(
            message,
            "Parquet error: Column count mismatch. Schema has 3 columns while Row Group has 2"
        );
    }

    /// A fully populated column chunk written as thrift and read back must equal the original
    #[test]
    fn test_column_chunk_metadata_thrift_conversion() {
        let column_descr = get_test_schema_descr().column(0);

        // Inputs that are bulky to spell inline are prepared up front
        let encodings = EncodingMask::new_from_encodings([Encoding::PLAIN, Encoding::RLE].iter());
        let page_encoding_stats = vec![
            PageEncodingStats {
                page_type: PageType::DATA_PAGE,
                encoding: Encoding::PLAIN,
                count: 3,
            },
            PageEncodingStats {
                page_type: PageType::DATA_PAGE,
                encoding: Encoding::RLE,
                count: 5,
            },
        ];
        let repetition_levels = LevelHistogram::from(vec![100, 100]);
        let definition_levels = LevelHistogram::from(vec![0, 200]);

        let expected = ColumnChunkMetaData::builder(column_descr.clone())
            .set_encodings_mask(encodings)
            .set_file_path("file_path".to_owned())
            .set_num_values(1000)
            .set_compression(Compression::SNAPPY)
            .set_total_compressed_size(2000)
            .set_total_uncompressed_size(3000)
            .set_data_page_offset(4000)
            .set_dictionary_page_offset(Some(5000))
            .set_page_encoding_stats(page_encoding_stats)
            .set_bloom_filter_offset(Some(6000))
            .set_bloom_filter_length(Some(25))
            .set_offset_index_offset(Some(7000))
            .set_offset_index_length(Some(25))
            .set_column_index_offset(Some(8000))
            .set_column_index_length(Some(25))
            .set_unencoded_byte_array_data_bytes(Some(2000))
            .set_repetition_level_histogram(Some(repetition_levels))
            .set_definition_level_histogram(Some(definition_levels))
            .build()
            .unwrap();

        let mut bytes = Vec::new();
        let mut protocol = ThriftCompactOutputProtocol::new(&mut bytes);
        expected.write_thrift(&mut protocol).unwrap();
        let decoded = read_column_chunk(&mut bytes, column_descr).unwrap();

        assert_eq!(decoded, expected);
    }

    /// A column chunk left at the builder defaults must also survive a thrift round trip
    #[test]
    fn test_column_chunk_metadata_thrift_conversion_empty() {
        let column_descr = get_test_schema_descr().column(0);

        let builder = ColumnChunkMetaData::builder(column_descr.clone());
        let expected = builder.build().unwrap();

        let mut bytes = Vec::new();
        let mut protocol = ThriftCompactOutputProtocol::new(&mut bytes);
        expected.write_thrift(&mut protocol).unwrap();
        let decoded = read_column_chunk(&mut bytes, column_descr).unwrap();

        assert_eq!(decoded, expected);
    }

    #[test]
    fn test_compressed_size() {
        let schema = get_test_schema_descr();

        // One column chunk per schema column, each reporting a compressed size of
        // 500 and an uncompressed size of 700.
        let chunks: Vec<_> = schema
            .columns()
            .iter()
            .map(|descr| {
                ColumnChunkMetaData::builder(descr.clone())
                    .set_total_compressed_size(500)
                    .set_total_uncompressed_size(700)
                    .build()
                    .unwrap()
            })
            .collect();

        let row_group = RowGroupMetaData::builder(schema)
            .set_num_rows(1000)
            .set_column_metadata(chunks)
            .build()
            .unwrap();

        // With the two-column test schema, 1000 matches 2 x 500 from the chunks above.
        let expected: i64 = 1000;
        let actual: i64 = row_group.compressed_size();

        assert_eq!(actual, expected);
    }

    /// Checks `memory_size()` of built Parquet metadata against hard-coded
    /// expected values: first for metadata without page indexes, then for
    /// metadata that also carries a column index and an offset index.
    ///
    /// NOTE(review): the expected sizes are literal numbers with separate values
    /// for builds with and without the `encryption` feature. They presumably
    /// depend on the layout of the metadata structures and on how the fixtures
    /// below are allocated, so changes here may require updating them — confirm.
    #[test]
    fn test_memory_size() {
        let schema_descr = get_test_schema_descr();

        // First set of column chunks: statistics are attached, but every optional
        // value passed to `Statistics::new` is `None`.
        let columns = schema_descr
            .columns()
            .iter()
            .map(|column_descr| {
                ColumnChunkMetaData::builder(column_descr.clone())
                    .set_statistics(Statistics::new::<i32>(None, None, None, None, false))
                    .build()
            })
            .collect::<Result<Vec<_>>>()
            .unwrap();
        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
            .set_num_rows(1000)
            .set_column_metadata(columns)
            .build()
            .unwrap();
        let row_group_meta = vec![row_group_meta];

        // File-level metadata shared by both `ParquetMetaData` instances built below.
        let version = 2;
        let num_rows = 1000;
        let created_by = Some(String::from("test harness"));
        let key_value_metadata = Some(vec![KeyValue::new(
            String::from("Foo"),
            Some(String::from("bar")),
        )]);
        let column_orders = Some(vec![
            ColumnOrder::UNDEFINED,
            ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNSIGNED),
        ]);
        let file_metadata = FileMetaData::new(
            version,
            num_rows,
            created_by,
            key_value_metadata,
            schema_descr.clone(),
            column_orders,
        );

        // Now, add in Exact Statistics: a second set of column chunks whose
        // statistics carry the concrete values `Some(0)` and `Some(100)`.
        let columns_with_stats = schema_descr
            .columns()
            .iter()
            .map(|column_descr| {
                ColumnChunkMetaData::builder(column_descr.clone())
                    .set_statistics(Statistics::new::<i32>(
                        Some(0),
                        Some(100),
                        None,
                        None,
                        false,
                    ))
                    .build()
            })
            .collect::<Result<Vec<_>>>()
            .unwrap();

        let row_group_meta_with_stats = RowGroupMetaData::builder(schema_descr)
            .set_num_rows(1000)
            .set_column_metadata(columns_with_stats)
            .build()
            .unwrap();
        let row_group_meta_with_stats = vec![row_group_meta_with_stats];

        // Base case: the row group with concrete statistics, no page indexes.
        let parquet_meta = ParquetMetaDataBuilder::new(file_metadata.clone())
            .set_row_groups(row_group_meta_with_stats)
            .build();

        // Exactly one of the two `let`s below is compiled, depending on the feature.
        #[cfg(not(feature = "encryption"))]
        let base_expected_size = 2766;
        #[cfg(feature = "encryption")]
        let base_expected_size = 2934;

        assert_eq!(parquet_meta.memory_size(), base_expected_size);

        // Build a BOOLEAN column index with a single appended entry and unwrap
        // the typed index out of the resulting enum.
        let mut column_index = ColumnIndexBuilder::new(Type::BOOLEAN);
        column_index.append(false, vec![1u8], vec![2u8, 3u8], 4);
        let column_index = column_index.build().unwrap();
        let native_index = match column_index {
            ColumnIndexMetaData::BOOLEAN(index) => index,
            _ => panic!("wrong type of column index"),
        };

        // Now, add in OffsetIndex: two entries, each with a row count, an
        // offset/size pair and an unencoded byte-array size.
        let mut offset_index = OffsetIndexBuilder::new();
        offset_index.append_row_count(1);
        offset_index.append_offset_and_size(2, 3);
        offset_index.append_unencoded_byte_array_data_bytes(Some(10));
        offset_index.append_row_count(1);
        offset_index.append_offset_and_size(2, 3);
        offset_index.append_unencoded_byte_array_data_bytes(Some(10));
        let offset_index = offset_index.build();

        // Second case: the row group built first (statistics values all `None`),
        // this time together with the column index and offset index.
        let parquet_meta = ParquetMetaDataBuilder::new(file_metadata)
            .set_row_groups(row_group_meta)
            .set_column_index(Some(vec![vec![ColumnIndexMetaData::BOOLEAN(native_index)]]))
            .set_offset_index(Some(vec![vec![offset_index]]))
            .build();

        #[cfg(not(feature = "encryption"))]
        let bigger_expected_size = 3192;
        #[cfg(feature = "encryption")]
        let bigger_expected_size = 3360;

        // more set fields means more memory usage
        assert!(bigger_expected_size > base_expected_size);
        assert_eq!(parquet_meta.memory_size(), bigger_expected_size);
    }

    /// Checks `memory_size()` of metadata that carries encryption information,
    /// first without and then with a `FileDecryptor` attached, against
    /// hard-coded expected values. Only compiled with the `encryption` feature.
    ///
    /// NOTE(review): the expected sizes are literal numbers; they presumably need
    /// updating whenever the metadata or decryptor structures change — confirm.
    #[test]
    #[cfg(feature = "encryption")]
    fn test_memory_size_with_decryptor() {
        use crate::encryption::decrypt::FileDecryptionProperties;
        use crate::file::metadata::thrift::encryption::AesGcmV1;

        let schema_descr = get_test_schema_descr();

        // One default-built column chunk per schema column.
        let columns = schema_descr
            .columns()
            .iter()
            .map(|column_descr| ColumnChunkMetaData::builder(column_descr.clone()).build())
            .collect::<Result<Vec<_>>>()
            .unwrap();
        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
            .set_num_rows(1000)
            .set_column_metadata(columns)
            .build()
            .unwrap();
        let row_group_meta = vec![row_group_meta];

        // File metadata with an AES-GCM-V1 encryption algorithm and footer signing
        // key metadata; the AAD values are reused below to build the decryptor.
        let version = 2;
        let num_rows = 1000;
        let aad_file_unique = vec![1u8; 8];
        let aad_prefix = vec![2u8; 8];
        let encryption_algorithm = EncryptionAlgorithm::AES_GCM_V1(AesGcmV1 {
            aad_prefix: Some(aad_prefix.clone()),
            aad_file_unique: Some(aad_file_unique.clone()),
            supply_aad_prefix: Some(true),
        });
        let footer_key_metadata = Some(vec![3u8; 8]);
        let file_metadata =
            FileMetaData::new(version, num_rows, None, None, schema_descr.clone(), None)
                .with_encryption_algorithm(Some(encryption_algorithm))
                .with_footer_signing_key_metadata(footer_key_metadata.clone());

        // Base case: encryption fields set on the file metadata, no decryptor.
        let parquet_meta_data = ParquetMetaDataBuilder::new(file_metadata.clone())
            .set_row_groups(row_group_meta.clone())
            .build();

        let base_expected_size = 2058;
        assert_eq!(parquet_meta_data.memory_size(), base_expected_size);

        // Decryption properties: a footer key, the AAD prefix, and the same
        // column key registered for every column path in the schema.
        let footer_key = "0123456789012345".as_bytes();
        let column_key = "1234567890123450".as_bytes();
        let mut decryption_properties_builder =
            FileDecryptionProperties::builder(footer_key.to_vec())
                .with_aad_prefix(aad_prefix.clone());
        for column in schema_descr.columns() {
            decryption_properties_builder = decryption_properties_builder
                .with_column_key(&column.path().string(), column_key.to_vec());
        }
        let decryption_properties = decryption_properties_builder.build().unwrap();
        let decryptor = FileDecryptor::new(
            &decryption_properties,
            footer_key_metadata.as_deref(),
            aad_file_unique,
            aad_prefix,
        )
        .unwrap();

        // Second case: identical metadata, now with the decryptor attached.
        let parquet_meta_data = ParquetMetaDataBuilder::new(file_metadata.clone())
            .set_row_groups(row_group_meta.clone())
            .set_file_decryptor(Some(decryptor))
            .build();

        // Attaching the decryptor is expected to increase the reported size.
        let expected_size_with_decryptor = 3072;
        assert!(expected_size_with_decryptor > base_expected_size);

        assert_eq!(
            parquet_meta_data.memory_size(),
            expected_size_with_decryptor
        );
    }

    /// Builds the sample schema descriptor used to create column metadata in
    /// these tests: a group named `schema` with two `INT32` primitive fields,
    /// `a` and `b`.
    fn get_test_schema_descr() -> SchemaDescPtr {
        // The two leaves differ only by name, so build them through one closure.
        let int32_leaf = |name: &str| {
            Arc::new(
                SchemaType::primitive_type_builder(name, Type::INT32)
                    .build()
                    .unwrap(),
            )
        };

        let root = SchemaType::group_type_builder("schema")
            .with_fields(vec![int32_leaf("a"), int32_leaf("b")])
            .build()
            .unwrap();

        Arc::new(SchemaDescriptor::new(Arc::new(root)))
    }
}
