@@ -1135,6 +1135,16 @@ mod tests {
1135
1135
requests : Arc < Mutex < Vec < Range < usize > > > > ,
1136
1136
}
1137
1137
1138
+ impl TestReader {
1139
+ fn new ( data : Bytes ) -> Self {
1140
+ Self {
1141
+ data,
1142
+ metadata : Default :: default ( ) ,
1143
+ requests : Default :: default ( ) ,
1144
+ }
1145
+ }
1146
+ }
1147
+
1138
1148
impl AsyncFileReader for TestReader {
1139
1149
fn get_bytes ( & mut self , range : Range < u64 > ) -> BoxFuture < ' _ , Result < Bytes > > {
1140
1150
let range = range. clone ( ) ;
@@ -1167,11 +1177,7 @@ mod tests {
1167
1177
let path = format ! ( "{testdata}/alltypes_plain.parquet" ) ;
1168
1178
let data = Bytes :: from ( std:: fs:: read ( path) . unwrap ( ) ) ;
1169
1179
1170
- let async_reader = TestReader {
1171
- data : data. clone ( ) ,
1172
- metadata : Default :: default ( ) ,
1173
- requests : Default :: default ( ) ,
1174
- } ;
1180
+ let async_reader = TestReader :: new ( data. clone ( ) ) ;
1175
1181
1176
1182
let requests = async_reader. requests . clone ( ) ;
1177
1183
let builder = ParquetRecordBatchStreamBuilder :: new ( async_reader)
@@ -1220,11 +1226,7 @@ mod tests {
1220
1226
let path = format ! ( "{testdata}/alltypes_plain.parquet" ) ;
1221
1227
let data = Bytes :: from ( std:: fs:: read ( path) . unwrap ( ) ) ;
1222
1228
1223
- let async_reader = TestReader {
1224
- data : data. clone ( ) ,
1225
- metadata : Default :: default ( ) ,
1226
- requests : Default :: default ( ) ,
1227
- } ;
1229
+ let async_reader = TestReader :: new ( data. clone ( ) ) ;
1228
1230
1229
1231
let requests = async_reader. requests . clone ( ) ;
1230
1232
let builder = ParquetRecordBatchStreamBuilder :: new ( async_reader)
@@ -1281,11 +1283,7 @@ mod tests {
1281
1283
let path = format ! ( "{testdata}/alltypes_tiny_pages_plain.parquet" ) ;
1282
1284
let data = Bytes :: from ( std:: fs:: read ( path) . unwrap ( ) ) ;
1283
1285
1284
- let async_reader = TestReader {
1285
- data : data. clone ( ) ,
1286
- metadata : Default :: default ( ) ,
1287
- requests : Default :: default ( ) ,
1288
- } ;
1286
+ let async_reader = TestReader :: new ( data. clone ( ) ) ;
1289
1287
1290
1288
let options = ArrowReaderOptions :: new ( ) . with_page_index ( true ) ;
1291
1289
let builder = ParquetRecordBatchStreamBuilder :: new_with_options ( async_reader, options)
@@ -1350,11 +1348,7 @@ mod tests {
1350
1348
1351
1349
assert_eq ! ( metadata. num_row_groups( ) , 1 ) ;
1352
1350
1353
- let async_reader = TestReader {
1354
- data : data. clone ( ) ,
1355
- metadata : Default :: default ( ) ,
1356
- requests : Default :: default ( ) ,
1357
- } ;
1351
+ let async_reader = TestReader :: new ( data. clone ( ) ) ;
1358
1352
1359
1353
let builder = ParquetRecordBatchStreamBuilder :: new ( async_reader)
1360
1354
. await
@@ -1391,11 +1385,7 @@ mod tests {
1391
1385
let path = format ! ( "{testdata}/alltypes_tiny_pages_plain.parquet" ) ;
1392
1386
let data = Bytes :: from ( std:: fs:: read ( path) . unwrap ( ) ) ;
1393
1387
1394
- let async_reader = TestReader {
1395
- data : data. clone ( ) ,
1396
- metadata : Default :: default ( ) ,
1397
- requests : Default :: default ( ) ,
1398
- } ;
1388
+ let async_reader = TestReader :: new ( data. clone ( ) ) ;
1399
1389
1400
1390
let options = ArrowReaderOptions :: new ( ) . with_page_index ( true ) ;
1401
1391
let builder = ParquetRecordBatchStreamBuilder :: new_with_options ( async_reader, options)
@@ -1469,11 +1459,7 @@ mod tests {
1469
1459
1470
1460
let selection = RowSelection :: from ( selectors) ;
1471
1461
1472
- let async_reader = TestReader {
1473
- data : data. clone ( ) ,
1474
- metadata : Default :: default ( ) ,
1475
- requests : Default :: default ( ) ,
1476
- } ;
1462
+ let async_reader = TestReader :: new ( data. clone ( ) ) ;
1477
1463
1478
1464
let options = ArrowReaderOptions :: new ( ) . with_page_index ( true ) ;
1479
1465
let builder = ParquetRecordBatchStreamBuilder :: new_with_options ( async_reader, options)
@@ -1535,11 +1521,7 @@ mod tests {
1535
1521
1536
1522
let selection = RowSelection :: from ( selectors) ;
1537
1523
1538
- let async_reader = TestReader {
1539
- data : data. clone ( ) ,
1540
- metadata : Default :: default ( ) ,
1541
- requests : Default :: default ( ) ,
1542
- } ;
1524
+ let async_reader = TestReader :: new ( data. clone ( ) ) ;
1543
1525
1544
1526
let options = ArrowReaderOptions :: new ( ) . with_page_index ( true ) ;
1545
1527
let builder = ParquetRecordBatchStreamBuilder :: new_with_options ( async_reader, options)
@@ -1566,6 +1548,70 @@ mod tests {
1566
1548
1567
1549
#[ tokio:: test]
1568
1550
async fn test_row_filter ( ) {
1551
+ let a = StringArray :: from_iter_values ( [ "a" , "b" , "b" , "b" , "c" , "c" ] ) ;
1552
+ let b = StringArray :: from_iter_values ( [ "1" , "2" , "3" , "4" , "5" , "6" ] ) ;
1553
+ let data = RecordBatch :: try_from_iter ( [
1554
+ ( "a" , Arc :: new ( a) as ArrayRef ) ,
1555
+ ( "b" , Arc :: new ( b) as ArrayRef ) ,
1556
+ ] )
1557
+ . unwrap ( ) ;
1558
+
1559
+ let mut buf = Vec :: with_capacity ( 1024 ) ;
1560
+ let mut writer = ArrowWriter :: try_new ( & mut buf, data. schema ( ) , None ) . unwrap ( ) ;
1561
+ writer. write ( & data) . unwrap ( ) ;
1562
+ writer. close ( ) . unwrap ( ) ;
1563
+
1564
+ let data: Bytes = buf. into ( ) ;
1565
+ let metadata = ParquetMetaDataReader :: new ( )
1566
+ . parse_and_finish ( & data)
1567
+ . unwrap ( ) ;
1568
+ let parquet_schema = metadata. file_metadata ( ) . schema_descr_ptr ( ) ;
1569
+
1570
+ let test = TestReader :: new ( data) ;
1571
+ let requests = test. requests . clone ( ) ;
1572
+
1573
+ let a_scalar = StringArray :: from_iter_values ( [ "b" ] ) ;
1574
+ let a_filter = ArrowPredicateFn :: new (
1575
+ ProjectionMask :: leaves ( & parquet_schema, vec ! [ 0 ] ) ,
1576
+ move |batch| eq ( batch. column ( 0 ) , & Scalar :: new ( & a_scalar) ) ,
1577
+ ) ;
1578
+
1579
+ let filter = RowFilter :: new ( vec ! [ Box :: new( a_filter) ] ) ;
1580
+
1581
+ let mask = ProjectionMask :: leaves ( & parquet_schema, vec ! [ 0 , 1 ] ) ;
1582
+ let stream = ParquetRecordBatchStreamBuilder :: new ( test)
1583
+ . await
1584
+ . unwrap ( )
1585
+ . with_projection ( mask. clone ( ) )
1586
+ . with_batch_size ( 1024 )
1587
+ . with_row_filter ( filter)
1588
+ . build ( )
1589
+ . unwrap ( ) ;
1590
+
1591
+ let batches: Vec < _ > = stream. try_collect ( ) . await . unwrap ( ) ;
1592
+ assert_eq ! ( batches. len( ) , 1 ) ;
1593
+
1594
+ let batch = & batches[ 0 ] ;
1595
+ assert_eq ! ( batch. num_columns( ) , 2 ) ;
1596
+
1597
+ // Filter should have kept only rows with "b" in column 0
1598
+ assert_eq ! (
1599
+ batch. column( 0 ) . as_ref( ) ,
1600
+ & StringArray :: from_iter_values( [ "b" , "b" , "b" ] )
1601
+ ) ;
1602
+ assert_eq ! (
1603
+ batch. column( 1 ) . as_ref( ) ,
1604
+ & StringArray :: from_iter_values( [ "2" , "3" , "4" ] )
1605
+ ) ;
1606
+
1607
+ // Should only have made 2 requests:
1608
+ // * First request fetches data for evaluating the predicate
1609
+ // * Second request fetches data for evaluating the projection
1610
+ assert_eq ! ( requests. lock( ) . unwrap( ) . len( ) , 2 ) ;
1611
+ }
1612
+
1613
+ #[ tokio:: test]
1614
+ async fn test_two_row_filters ( ) {
1569
1615
let a = StringArray :: from_iter_values ( [ "a" , "b" , "b" , "b" , "c" , "c" ] ) ;
1570
1616
let b = StringArray :: from_iter_values ( [ "1" , "2" , "3" , "4" , "5" , "6" ] ) ;
1571
1617
let c = Int32Array :: from_iter ( 0 ..6 ) ;
@@ -1587,11 +1633,7 @@ mod tests {
1587
1633
. unwrap ( ) ;
1588
1634
let parquet_schema = metadata. file_metadata ( ) . schema_descr_ptr ( ) ;
1589
1635
1590
- let test = TestReader {
1591
- data,
1592
- metadata : Default :: default ( ) ,
1593
- requests : Default :: default ( ) ,
1594
- } ;
1636
+ let test = TestReader :: new ( data) ;
1595
1637
let requests = test. requests . clone ( ) ;
1596
1638
1597
1639
let a_scalar = StringArray :: from_iter_values ( [ "b" ] ) ;
@@ -1634,6 +1676,9 @@ mod tests {
1634
1676
assert_eq ! ( val, 3 ) ;
1635
1677
1636
1678
// Should only have made 3 requests
1679
+ // * First request fetches data for evaluating the first predicate
1680
+ // * Second request fetches data for evaluating the second predicate
1681
+ // * Third request fetches data for evaluating the projection
1637
1682
assert_eq ! ( requests. lock( ) . unwrap( ) . len( ) , 3 ) ;
1638
1683
}
1639
1684
@@ -1664,11 +1709,7 @@ mod tests {
1664
1709
1665
1710
assert_eq ! ( metadata. num_row_groups( ) , 2 ) ;
1666
1711
1667
- let test = TestReader {
1668
- data,
1669
- metadata : Default :: default ( ) ,
1670
- requests : Default :: default ( ) ,
1671
- } ;
1712
+ let test = TestReader :: new ( data) ;
1672
1713
1673
1714
let stream = ParquetRecordBatchStreamBuilder :: new ( test. clone ( ) )
1674
1715
. await
@@ -1755,11 +1796,7 @@ mod tests {
1755
1796
1756
1797
assert_eq ! ( metadata. num_row_groups( ) , 1 ) ;
1757
1798
1758
- let async_reader = TestReader {
1759
- data : data. clone ( ) ,
1760
- metadata : Default :: default ( ) ,
1761
- requests : Default :: default ( ) ,
1762
- } ;
1799
+ let async_reader = TestReader :: new ( data. clone ( ) ) ;
1763
1800
1764
1801
let a_filter =
1765
1802
ArrowPredicateFn :: new ( ProjectionMask :: leaves ( & parquet_schema, vec ! [ 1 ] ) , |batch| {
@@ -1823,11 +1860,7 @@ mod tests {
1823
1860
1824
1861
assert_eq ! ( metadata. num_row_groups( ) , 1 ) ;
1825
1862
1826
- let async_reader = TestReader {
1827
- data : data. clone ( ) ,
1828
- metadata : Default :: default ( ) ,
1829
- requests : Default :: default ( ) ,
1830
- } ;
1863
+ let async_reader = TestReader :: new ( data. clone ( ) ) ;
1831
1864
1832
1865
let requests = async_reader. requests . clone ( ) ;
1833
1866
let ( _, fields) = parquet_to_arrow_schema_and_fields (
@@ -1893,11 +1926,7 @@ mod tests {
1893
1926
let path = format ! ( "{testdata}/alltypes_plain.parquet" ) ;
1894
1927
let data = Bytes :: from ( std:: fs:: read ( path) . unwrap ( ) ) ;
1895
1928
1896
- let async_reader = TestReader {
1897
- data : data. clone ( ) ,
1898
- metadata : Default :: default ( ) ,
1899
- requests : Default :: default ( ) ,
1900
- } ;
1929
+ let async_reader = TestReader :: new ( data. clone ( ) ) ;
1901
1930
1902
1931
let builder = ParquetRecordBatchStreamBuilder :: new ( async_reader)
1903
1932
. await
@@ -2036,11 +2065,7 @@ mod tests {
2036
2065
let testdata = arrow:: util:: test_util:: parquet_test_data ( ) ;
2037
2066
let path = format ! ( "{testdata}/data_index_bloom_encoding_stats.parquet" ) ;
2038
2067
let data = Bytes :: from ( std:: fs:: read ( path) . unwrap ( ) ) ;
2039
- let async_reader = TestReader {
2040
- data : data. clone ( ) ,
2041
- metadata : Default :: default ( ) ,
2042
- requests : Default :: default ( ) ,
2043
- } ;
2068
+ let async_reader = TestReader :: new ( data. clone ( ) ) ;
2044
2069
let builder = ParquetRecordBatchStreamBuilder :: new ( async_reader)
2045
2070
. await
2046
2071
. unwrap ( ) ;
@@ -2063,11 +2088,7 @@ mod tests {
2063
2088
}
2064
2089
2065
2090
async fn test_get_row_group_column_bloom_filter ( data : Bytes , with_length : bool ) {
2066
- let async_reader = TestReader {
2067
- data : data. clone ( ) ,
2068
- metadata : Default :: default ( ) ,
2069
- requests : Default :: default ( ) ,
2070
- } ;
2091
+ let async_reader = TestReader :: new ( data. clone ( ) ) ;
2071
2092
2072
2093
let mut builder = ParquetRecordBatchStreamBuilder :: new ( async_reader)
2073
2094
. await
@@ -2206,11 +2227,7 @@ mod tests {
2206
2227
. unwrap ( ) ;
2207
2228
let parquet_schema = metadata. file_metadata ( ) . schema_descr_ptr ( ) ;
2208
2229
2209
- let test = TestReader {
2210
- data,
2211
- metadata : Default :: default ( ) ,
2212
- requests : Default :: default ( ) ,
2213
- } ;
2230
+ let test = TestReader :: new ( data) ;
2214
2231
let requests = test. requests . clone ( ) ;
2215
2232
2216
2233
let a_scalar = StringArray :: from_iter_values ( [ "b" ] ) ;
@@ -2261,6 +2278,9 @@ mod tests {
2261
2278
assert_eq ! ( val, 3 ) ;
2262
2279
2263
2280
// Should only have made 3 requests
2281
+ // * First request fetches data for evaluating the first predicate
2282
+ // * Second request fetches data for evaluating the second predicate
2283
+ // * Third request fetches data for evaluating the projection
2264
2284
assert_eq ! ( requests. lock( ) . unwrap( ) . len( ) , 3 ) ;
2265
2285
}
2266
2286
0 commit comments