apache · benrobby · Aug 19, 2025 · Aug 19, 2025 · Aug 20, 2025 · Aug 20, 2025
diff --git a/python/pyspark/sql/tests/arrow/test_arrow_python_udf.py b/python/pyspark/sql/tests/arrow/test_arrow_python_udf.py
@@ -21,7 +21,7 @@
 from pyspark.sql import Row
 from pyspark.sql.functions import udf
 from pyspark.sql.tests.test_udf import BaseUDFTestsMixin
-from pyspark.sql.types import VarcharType
+from pyspark.sql.types import DayTimeIntervalType, VarcharType
 from pyspark.testing.sqlutils import (
     have_pandas,
     have_pyarrow,
@@ -243,6 +243,26 @@ def test_udf_use_arrow_and_session_conf(self):
                 udf(lambda x: str(x), useArrow=False).evalType, PythonEvalType.SQL_BATCHED_UDF
             )
 
+    def test_day_time_interval_type_casting(self):
+        """Test that DayTimeIntervalType UDFs work with Arrow and preserve field specifications."""
+
+        # HOUR TO SECOND
+        @udf(useArrow=True, returnType=DayTimeIntervalType(1, 3))
+        def return_interval(x):
+            return x
+
+        df = self.spark.sql("SELECT INTERVAL '200:13:50.3' HOUR TO SECOND as value").select(
+            return_interval("value").alias("result")
+        )
+        self.assertEqual(df.schema.fields[0].dataType, DayTimeIntervalType(1, 3))
+        self.assertIsNotNone(df.collect()[0]["result"])
+
+        df2 = self.spark.sql("SELECT INTERVAL '1 10:30:45.123' DAY TO SECOND as value").select(
+            return_interval("value").alias("result")
+        )
+        self.assertEqual(df.schema.fields[0].dataType, DayTimeIntervalType(1, 3))
+        self.assertIsNotNone(df2.collect()[0]["result"])
+
 
 @unittest.skipIf(
     not have_pandas or not have_pyarrow, pandas_requirement_message or pyarrow_requirement_message

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowEvalPythonExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowEvalPythonExec.scala
@@ -27,7 +27,7 @@ import org.apache.spark.sql.errors.QueryExecutionErrors
 import org.apache.spark.sql.execution.SparkPlan
 import org.apache.spark.sql.execution.metric.SQLMetric
 import org.apache.spark.sql.execution.python.EvalPythonExec.ArgumentMetadata
-import org.apache.spark.sql.types.{StructType, UserDefinedType}
+import org.apache.spark.sql.types.{ArrayType, DataType, DayTimeIntervalType, MapType, StructType, UserDefinedType}
 import org.apache.spark.sql.types.DataType.equalsIgnoreCompatibleCollation
 
 /**
@@ -124,6 +124,41 @@ class ArrowEvalPythonEvaluatorFactory(
     profiler: Option[String])
   extends EvalPythonEvaluatorFactory(childOutput, udfs, output) {
 
+  // Position-wise comparison of the expected types against the actual types: the two
+  // sequences must have the same length and every pair must pass typesEqualRecursive.
+  private def typesEqual(expectedTypes: Seq[DataType], actualTypes: Seq[DataType]): Boolean = {
+    val sameArity = expectedTypes.length == actualTypes.length
+    sameArity && expectedTypes.zip(actualTypes).forall { case (want, got) =>
+      typesEqualRecursive(want, got)
+    }
+  }
+
+  // Recursive type check: nested types must match structurally (incl. nullability) and a
+  // DayTimeIntervalType may be wider in `actual`; otherwise collation-tolerant equality.
+  private def typesEqualRecursive(expected: DataType, actual: DataType): Boolean = {
+    (expected, actual) match {
+      case (expected: DayTimeIntervalType, actual: DayTimeIntervalType) =>
+        // Lenient on purpose: Arrow reports the broadest interval range, so accept any
+        // actual range that fully covers the expected one (no information is lost).
+        actual.startField <= expected.startField && expected.endField <= actual.endField
+      case (expected: ArrayType, actual: ArrayType) =>
+        expected.containsNull == actual.containsNull &&
+        typesEqualRecursive(expected.elementType, actual.elementType)
+      case (expected: MapType, actual: MapType) =>
+        expected.valueContainsNull == actual.valueContainsNull &&
+        typesEqualRecursive(expected.keyType, actual.keyType) &&
+        typesEqualRecursive(expected.valueType, actual.valueType)
+      case (expected: StructType, actual: StructType) =>
+        expected.fields.length == actual.fields.length &&
+        expected.fields.zip(actual.fields).forall { case (expectedField, actualField) =>
+          expectedField.name == actualField.name &&
+          expectedField.nullable == actualField.nullable &&
+          typesEqualRecursive(expectedField.dataType, actualField.dataType)
+        }
+      case _ => equalsIgnoreCompatibleCollation(expected, actual)
+    }
+  }
+
   override def evaluate(
       funcs: Seq[(ChainedPythonFunctions, Long)],
       argMetas: Array[Array[ArgumentMetadata]],
@@ -152,10 +187,12 @@ class ArrowEvalPythonEvaluatorFactory(
 
     columnarBatchIter.flatMap { batch =>
       val actualDataTypes = (0 until batch.numCols()).map(i => batch.column(i).dataType())
-      if (!equalsIgnoreCompatibleCollation(outputTypes, actualDataTypes)) {
+
+      if (!typesEqual(outputTypes, actualDataTypes)) {
         throw QueryExecutionErrors.arrowDataTypeMismatchError(
           "pandas_udf()", outputTypes, actualDataTypes)
       }
+
       batch.rowIterator.asScala
     }
   }