Skip to content

Commit b078747

Browse files
committed
Merge branch 'main' of github.com:Groennbeck/datafusion-comet into array-size
2 parents afc926d + a1e6a39 commit b078747

File tree

1,796 files changed

+277326
-33635
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

1,796 files changed

+277326
-33635
lines changed

.github/actions/java-test/action.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -67,4 +67,4 @@ runs:
6767

6868
- name: Upload coverage results
6969
if: ${{ inputs.upload-test-reports == 'true' }}
70-
uses: codecov/codecov-action@v3 # uses v3 as it allows tokenless uploading
70+
uses: codecov/codecov-action@v5

.github/workflows/pr_build.yml

+1-2
Original file line numberDiff line numberDiff line change
@@ -211,8 +211,7 @@ jobs:
211211
uses: ./.github/actions/java-test
212212
with:
213213
maven_opts: -Pspark-${{ matrix.spark-version }}
214-
# https://github.com/codecov/codecov-action/issues/1549
215-
# upload-test-reports: true
214+
upload-test-reports: true
216215

217216
macos-aarch64-test-with-spark4_0:
218217
strategy:

.github/workflows/spark_sql_test.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ jobs:
7171
with:
7272
spark-version: ${{ matrix.spark-version.full }}
7373
spark-short-version: ${{ matrix.spark-version.short }}
74-
comet-version: '0.5.0-SNAPSHOT' # TODO: get this from pom.xml
74+
comet-version: '0.6.0-SNAPSHOT' # TODO: get this from pom.xml
7575
- name: Run Spark tests
7676
run: |
7777
cd apache-spark

.github/workflows/spark_sql_test_ansi.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ jobs:
6969
with:
7070
spark-version: ${{ matrix.spark-version.full }}
7171
spark-short-version: ${{ matrix.spark-version.short }}
72-
comet-version: '0.5.0-SNAPSHOT' # TODO: get this from pom.xml
72+
comet-version: '0.6.0-SNAPSHOT' # TODO: get this from pom.xml
7373
- name: Run Spark tests
7474
run: |
7575
cd apache-spark

README.md

+6-13
Original file line numberDiff line numberDiff line change
@@ -46,30 +46,23 @@ The following chart shows the time it takes to run the 22 TPC-H queries against
4646
using a single executor with 8 cores. See the [Comet Benchmarking Guide](https://datafusion.apache.org/comet/contributor-guide/benchmarking.html)
4747
for details of the environment used for these benchmarks.
4848

49-
When using Comet, the overall run time is reduced from 615 seconds to 364 seconds, a 1.7x speedup, with query 1
50-
running 9x faster than Spark.
49+
When using Comet, the overall run time is reduced from 640 seconds to 331 seconds, very close to a 2x speedup.
5150

52-
Running the same queries with DataFusion standalone (without Spark) using the same number of cores results in a 3.6x
53-
speedup compared to Spark.
51+
![](docs/source/_static/images/benchmark-results/0.5.0/tpch_allqueries.png)
5452

55-
Comet is not yet achieving full DataFusion speeds in all cases, but with future work we aim to provide a 2x-4x speedup
56-
for a broader set of queries.
53+
Here is a breakdown showing relative performance of Spark and Comet for each TPC-H query.
5754

58-
![](docs/source/_static/images/benchmark-results/0.4.0/tpch_allqueries.png)
59-
60-
Here is a breakdown showing relative performance of Spark, Comet, and DataFusion for each TPC-H query.
61-
62-
![](docs/source/_static/images/benchmark-results/0.4.0/tpch_queries_compare.png)
55+
![](docs/source/_static/images/benchmark-results/0.5.0/tpch_queries_compare.png)
6356

6457
The following charts shows how much Comet currently accelerates each query from the benchmark.
6558

6659
### Relative speedup
6760

68-
![](docs/source/_static/images/benchmark-results/0.4.0/tpch_queries_speedup_rel.png)
61+
![](docs/source/_static/images/benchmark-results/0.5.0/tpch_queries_speedup_rel.png)
6962

7063
### Absolute speedup
7164

72-
![](docs/source/_static/images/benchmark-results/0.4.0/tpch_queries_speedup_abs.png)
65+
![](docs/source/_static/images/benchmark-results/0.5.0/tpch_queries_speedup_abs.png)
7366

7467
These benchmarks can be reproduced in any environment using the documentation in the
7568
[Comet Benchmarking Guide](https://datafusion.apache.org/comet/contributor-guide/benchmarking.html). We encourage

common/pom.xml

+1-1
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ under the License.
2626
<parent>
2727
<groupId>org.apache.datafusion</groupId>
2828
<artifactId>comet-parent-spark${spark.version.short}_${scala.binary.version}</artifactId>
29-
<version>0.5.0-SNAPSHOT</version>
29+
<version>0.6.0-SNAPSHOT</version>
3030
<relativePath>../pom.xml</relativePath>
3131
</parent>
3232

common/src/main/java/org/apache/comet/parquet/AbstractColumnReader.java

+14
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
import org.slf4j.LoggerFactory;
2424

2525
import org.apache.parquet.column.ColumnDescriptor;
26+
import org.apache.parquet.schema.Type;
2627
import org.apache.spark.sql.types.DataType;
2728
import org.apache.spark.sql.types.TimestampNTZType$;
2829

@@ -36,6 +37,9 @@ public abstract class AbstractColumnReader implements AutoCloseable {
3637
/** The Spark data type. */
3738
protected final DataType type;
3839

40+
/** The Parquet field type for this column (may be null for legacy constructor callers). */
41+
protected final Type fieldType;
42+
3943
/** Parquet column descriptor. */
4044
protected final ColumnDescriptor descriptor;
4145

@@ -61,13 +65,23 @@ public abstract class AbstractColumnReader implements AutoCloseable {
6165

6266
public AbstractColumnReader(
6367
DataType type,
68+
Type fieldType,
6469
ColumnDescriptor descriptor,
6570
boolean useDecimal128,
6671
boolean useLegacyDateTimestamp) {
6772
this.type = type;
73+
this.fieldType = fieldType;
6874
this.descriptor = descriptor;
6975
this.useDecimal128 = useDecimal128;
7076
this.useLegacyDateTimestamp = useLegacyDateTimestamp;
77+
}
78+
79+
public AbstractColumnReader(
80+
DataType type,
81+
ColumnDescriptor descriptor,
82+
boolean useDecimal128,
83+
boolean useLegacyDateTimestamp) {
84+
this(type, null, descriptor, useDecimal128, useLegacyDateTimestamp);
7185
TypeUtil.checkParquetType(descriptor, type);
7286
}
7387

common/src/main/java/org/apache/comet/parquet/BatchReader.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -272,7 +272,7 @@ public void init() throws URISyntaxException, IOException {
272272
requestedSchema =
273273
CometParquetReadSupport.clipParquetSchema(
274274
requestedSchema, sparkSchema, isCaseSensitive, useFieldId, ignoreMissingIds);
275-
if (requestedSchema.getColumns().size() != sparkSchema.size()) {
275+
if (requestedSchema.getFieldCount() != sparkSchema.size()) {
276276
throw new IllegalArgumentException(
277277
String.format(
278278
"Spark schema has %d columns while " + "Parquet schema has %d columns",

common/src/main/java/org/apache/comet/parquet/Native.java

+52
Original file line numberDiff line numberDiff line change
@@ -234,4 +234,56 @@ public static native void setPageV2(
234234
* @param handle the handle to the native Parquet column reader
235235
*/
236236
public static native void closeColumnReader(long handle);
237+
238+
///////////// Arrow Native Parquet Reader APIs
239+
// TODO: Add partitionValues(?), improve requiredColumns to use a projection mask that corresponds
240+
// to arrow.
241+
// Add batch size, datetimeRebaseModeSpec, metrics(how?)...
242+
243+
/**
244+
* Initialize a record batch reader for a PartitionedFile
245+
*
246+
* @param filePath
247+
* @param start
248+
* @param length
249+
* @return a handle to the record batch reader, used in subsequent calls.
250+
*/
251+
public static native long initRecordBatchReader(
252+
String filePath,
253+
long fileSize,
254+
long start,
255+
long length,
256+
byte[] requiredSchema,
257+
String sessionTimezone);
258+
259+
// arrow native version of read batch
260+
/**
261+
* Read the next batch of data into memory on native side
262+
*
263+
* @param handle
264+
* @return the number of rows read
265+
*/
266+
public static native int readNextRecordBatch(long handle);
267+
268+
// Arrow-native equivalent of currentBatch. 'columnNum' is the index of the column in the record
batch
270+
/**
271+
* Load the column corresponding to columnNum in the currently loaded record batch into JVM
272+
*
273+
* @param handle
274+
* @param columnNum
275+
* @param arrayAddr
276+
* @param schemaAddr
277+
*/
278+
public static native void currentColumnBatch(
279+
long handle, int columnNum, long arrayAddr, long schemaAddr);
280+
281+
// arrow native version to close record batch reader
282+
283+
/**
284+
* Close the record batch reader. Free the resources
285+
*
286+
* @param handle
287+
*/
288+
public static native void closeRecordBatchReader(long handle);
237289
}

0 commit comments

Comments
 (0)