Skip to content

Commit 8bed541

Browse files
albertlockett and albertlockett authored
feat: Support round trip reading/writing Arrow type Dictionary(_, FixedSizeBinary(_)) to Parquet (#7446)
* support FixedSizeBinary in dict encoding
* roundtrip works
* cleanup
* clippy and linter
* support all types of keys in byte_array_dictionary
* back out change included by mistake
* linter
* PR feedback before cleanup
* PR feedback from Weston

---------

Co-authored-by: albertlockett <[email protected]>
1 parent 8fb2270 commit 8bed541

File tree

5 files changed

+76
-13
lines changed

5 files changed

+76
-13
lines changed

parquet/src/arrow/array_reader/builder.rs

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -289,9 +289,12 @@ fn build_primitive_reader(
289289
}
290290
_ => make_byte_array_reader(page_iterator, column_desc, arrow_type)?,
291291
},
292-
PhysicalType::FIXED_LEN_BYTE_ARRAY => {
293-
make_fixed_len_byte_array_reader(page_iterator, column_desc, arrow_type)?
294-
}
292+
PhysicalType::FIXED_LEN_BYTE_ARRAY => match arrow_type {
293+
Some(DataType::Dictionary(_, _)) => {
294+
make_byte_array_dictionary_reader(page_iterator, column_desc, arrow_type)?
295+
}
296+
_ => make_fixed_len_byte_array_reader(page_iterator, column_desc, arrow_type)?,
297+
},
295298
};
296299
Ok(Some(reader))
297300
}

parquet/src/arrow/array_reader/byte_array_dictionary.rs

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -90,21 +90,21 @@ pub fn make_byte_array_dictionary_reader(
9090
ArrowType::Dictionary(key_type, value_type) => {
9191
make_reader! {
9292
(pages, column_desc, data_type) => match (key_type.as_ref(), value_type.as_ref()) {
93-
(ArrowType::UInt8, ArrowType::Binary | ArrowType::Utf8) => (u8, i32),
93+
(ArrowType::UInt8, ArrowType::Binary | ArrowType::Utf8 | ArrowType::FixedSizeBinary(_)) => (u8, i32),
9494
(ArrowType::UInt8, ArrowType::LargeBinary | ArrowType::LargeUtf8) => (u8, i64),
95-
(ArrowType::Int8, ArrowType::Binary | ArrowType::Utf8) => (i8, i32),
95+
(ArrowType::Int8, ArrowType::Binary | ArrowType::Utf8 | ArrowType::FixedSizeBinary(_)) => (i8, i32),
9696
(ArrowType::Int8, ArrowType::LargeBinary | ArrowType::LargeUtf8) => (i8, i64),
97-
(ArrowType::UInt16, ArrowType::Binary | ArrowType::Utf8) => (u16, i32),
97+
(ArrowType::UInt16, ArrowType::Binary | ArrowType::Utf8 | ArrowType::FixedSizeBinary(_)) => (u16, i32),
9898
(ArrowType::UInt16, ArrowType::LargeBinary | ArrowType::LargeUtf8) => (u16, i64),
99-
(ArrowType::Int16, ArrowType::Binary | ArrowType::Utf8) => (i16, i32),
99+
(ArrowType::Int16, ArrowType::Binary | ArrowType::Utf8 | ArrowType::FixedSizeBinary(_)) => (i16, i32),
100100
(ArrowType::Int16, ArrowType::LargeBinary | ArrowType::LargeUtf8) => (i16, i64),
101-
(ArrowType::UInt32, ArrowType::Binary | ArrowType::Utf8) => (u32, i32),
101+
(ArrowType::UInt32, ArrowType::Binary | ArrowType::Utf8 | ArrowType::FixedSizeBinary(_)) => (u32, i32),
102102
(ArrowType::UInt32, ArrowType::LargeBinary | ArrowType::LargeUtf8) => (u32, i64),
103-
(ArrowType::Int32, ArrowType::Binary | ArrowType::Utf8) => (i32, i32),
103+
(ArrowType::Int32, ArrowType::Binary | ArrowType::Utf8 | ArrowType::FixedSizeBinary(_)) => (i32, i32),
104104
(ArrowType::Int32, ArrowType::LargeBinary | ArrowType::LargeUtf8) => (i32, i64),
105-
(ArrowType::UInt64, ArrowType::Binary | ArrowType::Utf8) => (u64, i32),
105+
(ArrowType::UInt64, ArrowType::Binary | ArrowType::Utf8 | ArrowType::FixedSizeBinary(_)) => (u64, i32),
106106
(ArrowType::UInt64, ArrowType::LargeBinary | ArrowType::LargeUtf8) => (u64, i64),
107-
(ArrowType::Int64, ArrowType::Binary | ArrowType::Utf8) => (i64, i32),
107+
(ArrowType::Int64, ArrowType::Binary | ArrowType::Utf8 | ArrowType::FixedSizeBinary(_)) => (i64, i32),
108108
(ArrowType::Int64, ArrowType::LargeBinary | ArrowType::LargeUtf8) => (i64, i64),
109109
}
110110
}

parquet/src/arrow/arrow_writer/byte_array.rs

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,8 @@ use crate::schema::types::ColumnDescPtr;
2727
use crate::util::bit_util::num_required_bits;
2828
use crate::util::interner::{Interner, Storage};
2929
use arrow_array::{
30-
Array, ArrayAccessor, BinaryArray, BinaryViewArray, DictionaryArray, LargeBinaryArray,
31-
LargeStringArray, StringArray, StringViewArray,
30+
Array, ArrayAccessor, BinaryArray, BinaryViewArray, DictionaryArray, FixedSizeBinaryArray,
31+
LargeBinaryArray, LargeStringArray, StringArray, StringViewArray,
3232
};
3333
use arrow_schema::DataType;
3434

@@ -85,6 +85,9 @@ macro_rules! downcast_op {
8585
DataType::LargeBinary => {
8686
downcast_dict_op!(key, LargeBinaryArray, $array, $op$(, $arg)*)
8787
}
88+
DataType::FixedSizeBinary(_) => {
89+
downcast_dict_op!(key, FixedSizeBinaryArray, $array, $op$(, $arg)*)
90+
}
8891
d => unreachable!("cannot downcast {} dictionary value to byte array", d),
8992
},
9093
d => unreachable!("cannot downcast {} to byte array", d),

parquet/src/arrow/arrow_writer/mod.rs

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -989,6 +989,9 @@ impl ArrowColumnWriterFactory {
989989
ArrowDataType::Utf8View | ArrowDataType::BinaryView => {
990990
out.push(bytes(leaves.next().unwrap())?)
991991
}
992+
ArrowDataType::FixedSizeBinary(_) => {
993+
out.push(bytes(leaves.next().unwrap())?)
994+
}
992995
_ => {
993996
out.push(col(leaves.next().unwrap())?)
994997
}
@@ -1333,6 +1336,7 @@ mod tests {
13331336
use arrow_buffer::{i256, IntervalDayTime, IntervalMonthDayNano, NullBuffer};
13341337
use arrow_schema::Fields;
13351338
use half::f16;
1339+
use num::{FromPrimitive, ToPrimitive};
13361340

13371341
use crate::basic::Encoding;
13381342
use crate::data_type::AsBytes;
@@ -1911,6 +1915,50 @@ mod tests {
19111915
roundtrip(batch, Some(SMALL_SIZE / 2));
19121916
}
19131917

1918+
/// Checks that a `Dictionary(K, FixedSizeBinary(4))` column can be passed through
/// `roundtrip` for every integer dictionary key type (signed and unsigned,
/// 8 through 64 bits).
#[test]
fn test_fixed_size_binary_in_dict() {
    /// Builds a one-column, non-nullable batch whose dictionary keys have type `K`
    /// and whose dictionary values are two 4-byte entries, then hands it to
    /// `roundtrip`.
    fn test_fixed_size_binary_in_dict_inner<K>()
    where
        K: ArrowDictionaryKeyType,
        K::Native: FromPrimitive + ToPrimitive + TryFrom<u8>,
        <<K as arrow_array::ArrowPrimitiveType>::Native as TryFrom<u8>>::Error: std::fmt::Debug,
    {
        let field = Field::new(
            "a",
            DataType::Dictionary(
                Box::new(K::DATA_TYPE),
                Box::new(DataType::FixedSizeBinary(4)),
            ),
            false,
        );
        let schema = Schema::new(vec![field]);

        // Keys 0, 0, 1: the first dictionary value is referenced twice, the
        // second once. The values fit in `u8`, so the conversion to every key
        // width (including `i8`) cannot fail.
        let keys: Vec<K::Native> = vec![
            K::Native::try_from(0u8).unwrap(),
            K::Native::try_from(0u8).unwrap(),
            K::Native::try_from(1u8).unwrap(),
        ];
        let keys = PrimitiveArray::<K>::from_iter_values(keys);
        let values = FixedSizeBinaryArray::try_from_iter(
            vec![vec![0, 0, 0, 0], vec![1, 1, 1, 1]].into_iter(),
        )
        .unwrap();

        let data = DictionaryArray::<K>::new(keys, Arc::new(values));
        let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(data)]).unwrap();
        roundtrip(batch, None);
    }

    // Unsigned key types.
    test_fixed_size_binary_in_dict_inner::<UInt8Type>();
    test_fixed_size_binary_in_dict_inner::<UInt16Type>();
    test_fixed_size_binary_in_dict_inner::<UInt32Type>();
    // Previously a second `UInt16Type` call, which left 64-bit unsigned keys untested.
    test_fixed_size_binary_in_dict_inner::<UInt64Type>();
    // Signed key types.
    test_fixed_size_binary_in_dict_inner::<Int8Type>();
    test_fixed_size_binary_in_dict_inner::<Int16Type>();
    test_fixed_size_binary_in_dict_inner::<Int32Type>();
    test_fixed_size_binary_in_dict_inner::<Int64Type>();
}
1961+
19141962
#[test]
19151963
fn test_empty_dict() {
19161964
let struct_fields = Fields::from(vec![Field::new(

parquet/src/arrow/buffer/dictionary_buffer.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,15 @@ impl<K: ArrowNativeType + Ord, V: OffsetSizeTrait> DictionaryBuffer<K, V> {
154154
}
155155
}
156156

157+
let ArrowType::Dictionary(_, value_type) = data_type else {
158+
unreachable!()
159+
};
160+
let values = if let ArrowType::FixedSizeBinary(size) = **value_type {
161+
arrow_cast::cast(&values, &ArrowType::FixedSizeBinary(size)).unwrap()
162+
} else {
163+
values
164+
};
165+
157166
let builder = ArrayDataBuilder::new(data_type.clone())
158167
.len(keys.len())
159168
.add_buffer(Buffer::from_vec(keys))

0 commit comments

Comments
 (0)