Skip to content

Commit 3b5d2a5

Browse files
bump dsl version, flatten, rebase
1 parent 5cedf53 commit 3b5d2a5

File tree

8 files changed

+351
-2
lines changed

8 files changed

+351
-2
lines changed

py-polars/docs/source/reference/io.rst

+9
Original file line numberDiff line numberDiff line change
@@ -154,3 +154,12 @@ Configuration for cloud credential provisioning.
154154
CredentialProviderAWS
155155
CredentialProviderAzure
156156
CredentialProviderGCP
157+
158+
Scan Cast Options
159+
~~~~~~~~~~~~~~~~~
160+
Configuration for type-casting during scans.
161+
162+
.. autosummary::
163+
:toctree: api/
164+
165+
ScanCastOptions

py-polars/polars/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,7 @@
175175
PartitionByKey,
176176
PartitionMaxSize,
177177
PartitionParted,
178+
ScanCastOptions,
178179
defer,
179180
read_avro,
180181
read_clipboard,
@@ -284,6 +285,7 @@
284285
"PartitionByKey",
285286
"PartitionMaxSize",
286287
"PartitionParted",
288+
"ScanCastOptions",
287289
"read_avro",
288290
"read_clipboard",
289291
"read_csv",

py-polars/polars/io/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Functions for reading data."""
22

33
from polars.io.avro import read_avro
4+
from polars.io.cast_options import ScanCastOptions
45
from polars.io.clipboard import read_clipboard
56
from polars.io.csv import read_csv, read_csv_batched, scan_csv
67
from polars.io.database import read_database, read_database_uri
@@ -35,6 +36,7 @@
3536
"KeyedPartition",
3637
"BasePartitionContext",
3738
"KeyedPartitionContext",
39+
"ScanCastOptions",
3840
"read_avro",
3941
"read_clipboard",
4042
"read_csv",

py-polars/polars/io/cast_options.py

+97
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
from __future__ import annotations
2+
3+
from typing import TYPE_CHECKING, Literal
4+
5+
from polars._utils.unstable import issue_unstable_warning
6+
7+
if TYPE_CHECKING:
8+
from collections.abc import Collection
9+
10+
from typing_extensions import TypeAlias
11+
12+
13+
# Permitted (non-"forbid") cast behaviors for float source dtypes.
FloatCastOption: TypeAlias = Literal["upcast", "downcast"]
# Permitted (non-"forbid") cast behaviors for datetime source dtypes.
DatetimeCastOption: TypeAlias = Literal["nanosecond-downcast", "convert-timezone"]
15+
16+
17+
class ScanCastOptions:
    """Options for type-casting when scanning files."""

    def __init__(
        self,
        *,
        integer_cast: Literal["upcast", "forbid"] = "forbid",
        float_cast: Literal["forbid"]
        | FloatCastOption
        | Collection[FloatCastOption] = "forbid",
        datetime_cast: Literal["forbid"]
        | DatetimeCastOption
        | Collection[DatetimeCastOption] = "forbid",
        missing_struct_fields: Literal["insert", "raise"] = "raise",
        extra_struct_fields: Literal["ignore", "raise"] = "raise",
        _internal_call: bool = False,
    ) -> None:
        """
        Configuration for type-casting of columns when reading files.

        Useful when scanning datasets whose schemas have been modified over
        time. An instance of this object is generally passed to a supported
        `scan_*` function via the `cast_options` parameter.

        .. warning::
            This functionality is considered **unstable**. It may be changed
            at any point without it being considered a breaking change.

        Parameters
        ----------
        integer_cast
            Casting behavior for integer source types:

            * `upcast`: Allow lossless casting to wider integer types.
            * `forbid`: Raises an error if dtypes do not match.

        float_cast
            Casting behavior for float source types:

            * `upcast`: Allow casting to higher precision float types.
            * `downcast`: Allow casting to lower precision float types.
            * `forbid`: Raises an error if dtypes do not match.

        datetime_cast
            Casting behavior for datetime source types:

            * `nanosecond-downcast`: Allow nanosecond precision datetime to be \
            downcast to any lower precision. This has a similar effect to \
            PyArrow's `coerce_int96_timestamp_unit`.
            * `convert-timezone`: Allow casting to a different timezone.
            * `forbid`: Raises an error if dtypes do not match.

        missing_struct_fields
            Behavior when struct fields defined in the schema are absent
            from the data:

            * `insert`: Inserts the missing fields.
            * `raise`: Raises an error.

        extra_struct_fields
            Behavior when struct fields outside of the defined schema are
            encountered in the data:

            * `ignore`: Silently ignores.
            * `raise`: Raises an error.

        """
        # `_internal_call` suppresses the unstable-feature warning when this
        # object is constructed internally (see `_default` below).
        if not _internal_call:
            issue_unstable_warning("ScanCastOptions is considered unstable.")

        self.integer_cast = integer_cast
        self.float_cast = float_cast
        self.datetime_cast = datetime_cast
        self.missing_struct_fields = missing_struct_fields
        self.extra_struct_fields = extra_struct_fields

    # Called from the Rust side; exists so that constructing the default
    # options does not accidentally print unstable-feature messages.
    @staticmethod
    def _default() -> ScanCastOptions:
        return ScanCastOptions(_internal_call=True)

py-polars/polars/io/parquet/functions.py

+16
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535

3636
from polars import DataFrame, DataType, LazyFrame
3737
from polars._typing import FileSource, ParallelStrategy, SchemaDict
38+
from polars.io.cast_options import ScanCastOptions
3839
from polars.io.cloud import CredentialProviderFunction
3940
from polars.io.cloud.credential_provider._builder import CredentialProviderBuilder
4041

@@ -384,6 +385,7 @@ def scan_parquet(
384385
retries: int = 2,
385386
include_file_paths: str | None = None,
386387
allow_missing_columns: bool = False,
388+
cast_options: ScanCastOptions | None = None,
387389
) -> LazyFrame:
388390
"""
389391
Lazily read from a local or cloud-hosted parquet file (or files).
@@ -491,6 +493,13 @@ def scan_parquet(
491493
raise an error. However, if `allow_missing_columns` is set to
492494
`True`, a full-NULL column is returned instead of erroring for the files
493495
that do not contain the column.
496+
cast_options
497+
Configuration for column type-casting during scans. Useful for datasets
498+
containing files that have differing schemas.
499+
500+
.. warning::
501+
This functionality is considered **unstable**. It may be changed
502+
at any point without it being considered a breaking change.
494503
495504
See Also
496505
--------
@@ -522,6 +531,10 @@ def scan_parquet(
522531
msg = "the `hive_schema` parameter of `scan_parquet` is considered unstable."
523532
issue_unstable_warning(msg)
524533

534+
if cast_options is not None:
535+
msg = "The `cast_options` parameter of `scan_parquet` is considered unstable."
536+
issue_unstable_warning(msg)
537+
525538
if isinstance(source, (str, Path)):
526539
source = normalize_filepath(source, check_not_directory=False)
527540
elif is_path_or_str_sequence(source):
@@ -553,6 +566,7 @@ def scan_parquet(
553566
glob=glob,
554567
include_file_paths=include_file_paths,
555568
allow_missing_columns=allow_missing_columns,
569+
cast_options=cast_options,
556570
)
557571

558572

@@ -577,6 +591,7 @@ def _scan_parquet_impl(
577591
retries: int = 2,
578592
include_file_paths: str | None = None,
579593
allow_missing_columns: bool = False,
594+
cast_options: ScanCastOptions | None = None,
580595
) -> LazyFrame:
581596
if isinstance(source, list):
582597
sources = source
@@ -610,5 +625,6 @@ def _scan_parquet_impl(
610625
glob=glob,
611626
include_file_paths=include_file_paths,
612627
allow_missing_columns=allow_missing_columns,
628+
cast_options=cast_options,
613629
)
614630
return wrap_ldf(pylf)

py-polars/tests/unit/io/test_lazy_parquet.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -805,7 +805,7 @@ def test_parquet_schema_arg(
805805

806806
with pytest.raises(
807807
pl.exceptions.SchemaError,
808-
match="data type mismatch for column b: expected: i8, found: i64",
808+
match="data type mismatch for column b: incoming: Int64 != target: Int8",
809809
):
810810
lf.collect(engine="streaming" if streaming else "in-memory")
811811

py-polars/tests/unit/io/test_multiscan.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -338,7 +338,10 @@ def test_schema_mismatch_type_mismatch(
338338
if scan is pl.scan_ndjson
339339
else pytest.raises(
340340
pl.exceptions.SchemaError,
341-
match="data type mismatch for column xyz_col: expected: i64, found: str",
341+
match=(
342+
"data type mismatch for column xyz_col: "
343+
"incoming: String != target: Int64"
344+
),
342345
)
343346
)
344347

0 commit comments

Comments
 (0)