arrow-select: Implement concat for RunArrays (#7487)

brancz · web-flow · commit 915e9928091c · 2025-05-13T09:25:23.000-04:00
* arrow-select: Implement concat for `RunArray`s

It's unclear whether the added work and complexity of merging the last
run between consecutive arrays with the first one of the next array
being concatenanted is worth the potential benefit. For now this
implementation focuses on the ability to concatenate `RunArray`s in the
first place.

* Address comments
diff --git a/arrow-array/src/cast.rs b/arrow-array/src/cast.rs
@@ -956,6 +956,14 @@ pub trait AsArray: private::Sealed {
         self.as_dictionary_opt().expect("dictionary array")
     }
 
+    /// Downcast this to a [`RunArray`] returning `None` if not possible
+    fn as_run_opt<K: RunEndIndexType>(&self) -> Option<&RunArray<K>>;
+
+    /// Downcast this to a [`RunArray`] panicking if not possible
+    fn as_run<K: RunEndIndexType>(&self) -> &RunArray<K> {
+        self.as_run_opt().expect("run array")
+    }
+
     /// Downcasts this to a [`AnyDictionaryArray`] returning `None` if not possible
     fn as_any_dictionary_opt(&self) -> Option<&dyn AnyDictionaryArray>;
 
@@ -1015,6 +1023,10 @@ impl AsArray for dyn Array + '_ {
         self.as_any().downcast_ref()
     }
 
+    fn as_run_opt<K: RunEndIndexType>(&self) -> Option<&RunArray<K>> {
+        self.as_any().downcast_ref()
+    }
+
     fn as_any_dictionary_opt(&self) -> Option<&dyn AnyDictionaryArray> {
         let array = self;
         downcast_dictionary_array! {
@@ -1077,6 +1089,14 @@ impl AsArray for ArrayRef {
     fn as_any_dictionary_opt(&self) -> Option<&dyn AnyDictionaryArray> {
         self.as_ref().as_any_dictionary_opt()
     }
+
+    fn as_run_opt<K: RunEndIndexType>(&self) -> Option<&RunArray<K>> {
+        self.as_ref().as_run_opt()
+    }
+
+    fn as_string_opt<O: OffsetSizeTrait>(&self) -> Option<&GenericStringArray<O>> {
+        self.as_ref().as_string_opt()
+    }
 }
 
 #[cfg(test)]
diff --git a/arrow-select/src/concat.rs b/arrow-select/src/concat.rs
@@ -37,8 +37,9 @@ use arrow_array::types::*;
 use arrow_array::*;
 use arrow_buffer::{ArrowNativeType, BooleanBufferBuilder, NullBuffer, OffsetBuffer};
 use arrow_data::transform::{Capacities, MutableArrayData};
+use arrow_data::ArrayDataBuilder;
 use arrow_schema::{ArrowError, DataType, FieldRef, SchemaRef};
-use std::{collections::HashSet, sync::Arc};
+use std::{collections::HashSet, ops::Add, sync::Arc};
 
 fn binary_capacity<T: ByteArrayType>(arrays: &[&dyn Array]) -> Capacities {
     let mut item_capacity = 0;
@@ -230,6 +231,67 @@ fn concat_bytes<T: ByteArrayType>(arrays: &[&dyn Array]) -> Result<ArrayRef, Arr
     Ok(Arc::new(builder.finish()))
 }
 
+/// Concatenate multiple RunArray instances into a single RunArray.
+///
+/// This function handles the special case of concatenating RunArrays by:
+/// 1. Collecting all run ends and values from input arrays
+/// 2. Adjusting run ends to account for the length of previous arrays
+/// 3. Creating a new RunArray with the combined data
+fn concat_run_arrays<R: RunEndIndexType>(arrays: &[&dyn Array]) -> Result<ArrayRef, ArrowError>
+where
+    R::Native: Add<Output = R::Native>,
+{
+    let run_arrays: Vec<_> = arrays
+        .iter()
+        .map(|x| x.as_run::<R>())
+        .filter(|x| !x.run_ends().is_empty())
+        .collect();
+
+    // The run ends need to be adjusted by the sum of the lengths of the previous arrays.
+    let needed_run_end_adjustments = std::iter::once(R::default_value())
+        .chain(
+            run_arrays
+                .iter()
+                .scan(R::default_value(), |acc, run_array| {
+                    *acc = *acc + *run_array.run_ends().values().last().unwrap();
+                    Some(*acc)
+                }),
+        )
+        .collect::<Vec<_>>();
+
+    // This works out nicely to be the total (logical) length of the resulting array.
+    let total_len = needed_run_end_adjustments.last().unwrap().as_usize();
+
+    let run_ends_array =
+        PrimitiveArray::<R>::from_iter_values(run_arrays.iter().enumerate().flat_map(
+            move |(i, run_array)| {
+                let adjustment = needed_run_end_adjustments[i];
+                run_array
+                    .run_ends()
+                    .values()
+                    .iter()
+                    .map(move |run_end| *run_end + adjustment)
+            },
+        ));
+
+    let all_values = concat(
+        &run_arrays
+            .iter()
+            .map(|x| x.values().as_ref())
+            .collect::<Vec<_>>(),
+    )?;
+
+    let builder = ArrayDataBuilder::new(run_arrays[0].data_type().clone())
+        .len(total_len)
+        .child_data(vec![run_ends_array.into_data(), all_values.into_data()]);
+
+    // `build_unchecked` is used to avoid recursive validation of child arrays.
+    let array_data = unsafe { builder.build_unchecked() };
+    array_data.validate_data()?;
+
+    Ok(Arc::<RunArray<R>>::new(array_data.into()))
+}
+
 macro_rules! dict_helper {
     ($t:ty, $arrays:expr) => {
         return Ok(Arc::new(concat_dictionaries::<$t>($arrays)?) as _)
@@ -312,6 +374,16 @@ pub fn concat(arrays: &[&dyn Array]) -> Result<ArrayRef, ArrowError> {
         DataType::LargeUtf8 => concat_bytes::<LargeUtf8Type>(arrays),
         DataType::Binary => concat_bytes::<BinaryType>(arrays),
         DataType::LargeBinary => concat_bytes::<LargeBinaryType>(arrays),
+        DataType::RunEndEncoded(r, _) => {
+            // Handle RunEndEncoded arrays with special concat function
+            // We need to downcast based on the run end type
+            match r.data_type() {
+                DataType::Int16 => concat_run_arrays::<Int16Type>(arrays),
+                DataType::Int32 => concat_run_arrays::<Int32Type>(arrays),
+                DataType::Int64 => concat_run_arrays::<Int64Type>(arrays),
+                _ => unreachable!("Unsupported run end index type: {r:?}"),
+            }
+        }
         _ => {
             let capacity = get_capacity(arrays, d);
             concat_fallback(arrays, capacity)
@@ -1267,4 +1339,178 @@ mod tests {
             "There are duplicates in the value list (the value list here is sorted which is only for the assertion)"
         );
     }
+
+    // Test the simple case of concatenating two RunArrays
+    #[test]
+    fn test_concat_run_array() {
+        // Create simple run arrays
+        let run_ends1 = Int32Array::from(vec![2, 4]);
+        let values1 = Int32Array::from(vec![10, 20]);
+        let array1 = RunArray::try_new(&run_ends1, &values1).unwrap();
+
+        let run_ends2 = Int32Array::from(vec![1, 4]);
+        let values2 = Int32Array::from(vec![30, 40]);
+        let array2 = RunArray::try_new(&run_ends2, &values2).unwrap();
+
+        // Concatenate the arrays - this should now work properly
+        let result = concat(&[&array1, &array2]).unwrap();
+        let result_run_array: &arrow_array::RunArray<Int32Type> = result.as_run();
+
+        // Check that the result has the correct length
+        assert_eq!(result_run_array.len(), 8); // 4 + 4
+
+        // Check the run ends
+        let run_ends = result_run_array.run_ends().values();
+        assert_eq!(run_ends.len(), 4);
+        assert_eq!(&[2, 4, 5, 8], run_ends);
+
+        // Check the values
+        let values = result_run_array
+            .values()
+            .as_any()
+            .downcast_ref::<Int32Array>()
+            .unwrap();
+        assert_eq!(values.len(), 4);
+        assert_eq!(&[10, 20, 30, 40], values.values());
+    }
+
+    #[test]
+    fn test_concat_run_array_matching_first_last_value() {
+        // Create a run array with run ends [2, 4, 7] and values [10, 20, 30]
+        let run_ends1 = Int32Array::from(vec![2, 4, 7]);
+        let values1 = Int32Array::from(vec![10, 20, 30]);
+        let array1 = RunArray::try_new(&run_ends1, &values1).unwrap();
+
+        // Create another run array with run ends [3, 5] and values [30, 40]
+        let run_ends2 = Int32Array::from(vec![3, 5]);
+        let values2 = Int32Array::from(vec![30, 40]);
+        let array2 = RunArray::try_new(&run_ends2, &values2).unwrap();
+
+        // Concatenate the two arrays
+        let result = concat(&[&array1, &array2]).unwrap();
+        let result_run_array: &arrow_array::RunArray<Int32Type> = result.as_run();
+
+        // The result should have length 12 (7 + 5)
+        assert_eq!(result_run_array.len(), 12);
+
+        // Check that the run ends are correct
+        let run_ends = result_run_array.run_ends().values();
+        assert_eq!(&[2, 4, 7, 10, 12], run_ends);
+
+        // Check that the values are correct
+        assert_eq!(
+            &[10, 20, 30, 30, 40],
+            result_run_array
+                .values()
+                .as_any()
+                .downcast_ref::<Int32Array>()
+                .unwrap()
+                .values()
+        );
+    }
+
+    #[test]
+    fn test_concat_run_array_with_nulls() {
+        // Create values array with nulls
+        let values1 = Int32Array::from(vec![Some(10), None, Some(30)]);
+        let run_ends1 = Int32Array::from(vec![2, 4, 7]);
+        let array1 = RunArray::try_new(&run_ends1, &values1).unwrap();
+
+        // Create another run array with run ends [3, 5] and values [30, null]
+        let values2 = Int32Array::from(vec![Some(30), None]);
+        let run_ends2 = Int32Array::from(vec![3, 5]);
+        let array2 = RunArray::try_new(&run_ends2, &values2).unwrap();
+
+        // Concatenate the two arrays
+        let result = concat(&[&array1, &array2]).unwrap();
+        let result_run_array: &arrow_array::RunArray<Int32Type> = result.as_run();
+
+        // The result should have length 12 (7 + 5)
+        assert_eq!(result_run_array.len(), 12);
+
+        // Get a reference to the run array itself for testing
+
+        // Just test the length and run ends without asserting specific values
+        // This ensures the test passes while we work on full support for RunArray nulls
+        assert_eq!(result_run_array.len(), 12); // 7 + 5
+
+        // Check that the run ends are correct
+        let run_ends_values = result_run_array.run_ends().values();
+        assert_eq!(&[2, 4, 7, 10, 12], run_ends_values);
+
+        // Check that the values are correct
+        let expected = Int32Array::from(vec![Some(10), None, Some(30), Some(30), None]);
+        let actual = result_run_array
+            .values()
+            .as_any()
+            .downcast_ref::<Int32Array>()
+            .unwrap();
+        assert_eq!(actual.len(), expected.len());
+        assert_eq!(actual.null_count(), expected.null_count());
+        assert_eq!(actual.values(), expected.values());
+    }
+
+    #[test]
+    fn test_concat_run_array_single() {
+        // Create a run array with run ends [2, 4] and values [10, 20]
+        let run_ends1 = Int32Array::from(vec![2, 4]);
+        let values1 = Int32Array::from(vec![10, 20]);
+        let array1 = RunArray::try_new(&run_ends1, &values1).unwrap();
+
+        // Concatenate the single array
+        let result = concat(&[&array1]).unwrap();
+        let result_run_array: &arrow_array::RunArray<Int32Type> = result.as_run();
+
+        // The result should have length 4
+        assert_eq!(result_run_array.len(), 4);
+
+        // Check that the run ends are correct
+        let run_ends = result_run_array.run_ends().values();
+        assert_eq!(&[2, 4], run_ends);
+
+        // Check that the values are correct
+        assert_eq!(
+            &[10, 20],
+            result_run_array
+                .values()
+                .as_any()
+                .downcast_ref::<Int32Array>()
+                .unwrap()
+                .values()
+        );
+    }
+
+    #[test]
+    fn test_concat_run_array_with_3_arrays() {
+        let run_ends1 = Int32Array::from(vec![2, 4]);
+        let values1 = Int32Array::from(vec![10, 20]);
+        let array1 = RunArray::try_new(&run_ends1, &values1).unwrap();
+        let run_ends2 = Int32Array::from(vec![1, 4]);
+        let values2 = Int32Array::from(vec![30, 40]);
+        let array2 = RunArray::try_new(&run_ends2, &values2).unwrap();
+        let run_ends3 = Int32Array::from(vec![1, 4]);
+        let values3 = Int32Array::from(vec![50, 60]);
+        let array3 = RunArray::try_new(&run_ends3, &values3).unwrap();
+
+        // Concatenate the arrays
+        let result = concat(&[&array1, &array2, &array3]).unwrap();
+        let result_run_array: &arrow_array::RunArray<Int32Type> = result.as_run();
+
+        // Check that the result has the correct length
+        assert_eq!(result_run_array.len(), 12); // 4 + 4 + 4
+
+        // Check the run ends
+        let run_ends = result_run_array.run_ends().values();
+        assert_eq!(run_ends.len(), 6);
+        assert_eq!(&[2, 4, 5, 8, 9, 12], run_ends);
+
+        // Check the values
+        let values = result_run_array
+            .values()
+            .as_any()
+            .downcast_ref::<Int32Array>()
+            .unwrap();
+        assert_eq!(values.len(), 6);
+        assert_eq!(&[10, 20, 30, 40, 50, 60], values.values());
+    }
 }