
Commit 82fd7bb

kelvinjian-db authored and cloud-fan committed
[SPARK-48771][SQL] Speed up LogicalPlanIntegrity.validateExprIdUniqueness for large query plans
### What changes were proposed in this pull request?

This PR rewrites `LogicalPlanIntegrity.hasUniqueExprIdsForOutput` to traverse the query plan only once and to avoid expensive Scala collections operations like `.flatten`, `.groupBy`, and `.distinct`.

### Why are the changes needed?

Speeds up query compilation when plan validation is enabled.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Made sure existing UTs pass.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #47170 from kelvinjian-db/SPARK-48771-speed-up.

Authored-by: Kelvin Jiang <kelvin.jiang@databricks.com>
Signed-off-by: Wenchen Fan <wenchen@databricks.com>
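As a rough illustration of the optimization described above, here is a simplified, REPL-style Scala sketch (not the actual Catalyst code; `Node`, `slowCheck`, `fastCheck`, and the toy `(exprId, dataType)` pairs are invented for this example). The old shape builds intermediate collections with `.flatten`/`.groupBy`/`.distinct`; the new shape accumulates into one mutable map during a single traversal:

```scala
import scala.collection.mutable

// Toy stand-in for a plan node: `output` is a list of (exprId, dataType) pairs.
case class Node(output: Seq[(Long, String)], children: Seq[Node] = Nil)

// Old shape: build intermediate collections, then group and de-duplicate.
def slowCheck(root: Node): Option[Long] = {
  def collectAll(n: Node): Seq[Node] = n +: n.children.flatMap(collectAll)
  val pairs = collectAll(root).map(_.output).flatten
  pairs.groupBy(_._1).values.map(_.distinct).collectFirst {
    case group if group.length > 1 => group.head._1
  }
}

// New shape: a single traversal that accumulates into one mutable map.
def fastCheck(root: Node): Option[Long] = {
  val typesById = mutable.HashMap.empty[Long, mutable.HashSet[String]]
  def visit(n: Node): Unit = {
    n.output.foreach { case (id, dt) =>
      typesById.getOrElseUpdate(id, mutable.HashSet.empty[String]) += dt
    }
    n.children.foreach(visit)
  }
  visit(root)
  typesById.collectFirst { case (id, types) if types.size > 1 => id }
}

// exprId 1 appears with two different types, so both checks report it.
val plan = Node(Seq(1L -> "int"), Seq(Node(Seq(1L -> "string"))))
assert(slowCheck(plan) == Some(1L) && fastCheck(plan) == Some(1L))
```

Both versions return the same answer; the second avoids materializing the flattened pair list and the grouped map, which is what matters for very large plans.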
1 parent aa3c369 commit 82fd7bb

File tree

1 file changed: +25 −19 lines

  • sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala


sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala

Lines changed: 25 additions & 19 deletions
```diff
@@ -17,6 +17,8 @@
 
 package org.apache.spark.sql.catalyst.plans.logical
 
+import scala.collection.mutable
+
 import org.apache.spark.SparkUnsupportedOperationException
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.AnalysisException
@@ -29,7 +31,7 @@ import org.apache.spark.sql.catalyst.trees.TreePattern.{LOGICAL_QUERY_STAGE, Tre
 import org.apache.spark.sql.catalyst.types.DataTypeUtils
 import org.apache.spark.sql.catalyst.util.MetadataColumnHelper
 import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors}
-import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.types.{DataType, StructType}
 
 
 abstract class LogicalPlan
@@ -314,31 +316,35 @@ object LogicalPlanIntegrity {
    * in plan output. Returns the error message if the check does not pass.
    */
   def hasUniqueExprIdsForOutput(plan: LogicalPlan): Option[String] = {
-    val exprIds = plan.collect { case p if canGetOutputAttrs(p) =>
-      // NOTE: we still need to filter resolved expressions here because the output of
-      // some resolved logical plans can have unresolved references,
-      // e.g., outer references in `ExistenceJoin`.
-      p.output.filter(_.resolved).map { a => (a.exprId, a.dataType.asNullable) }
-    }.flatten
+    // SPARK-48771: rewritten using mutable collections to improve this function's performance and
+    // avoid unnecessary traversals of the query plan.
+    val exprIds = mutable.HashMap.empty[ExprId, mutable.HashSet[DataType]]
+    val ignoredExprIds = mutable.HashSet.empty[ExprId]
 
-    val ignoredExprIds = plan.collect {
+    plan.foreach {
       // NOTE: `Union` currently reuses input `ExprId`s for output references, but we cannot
       // simply modify the code for assigning new `ExprId`s in `Union#output` because
       // the modification will make breaking changes (See SPARK-32741(#29585)).
       // So, this check just ignores the `exprId`s of `Union` output.
-      case u: Union if u.resolved => u.output.map(_.exprId)
-    }.flatten.toSet
-
-    val groupedDataTypesByExprId = exprIds.filterNot { case (exprId, _) =>
-      ignoredExprIds.contains(exprId)
-    }.groupBy(_._1).values.map(_.distinct)
+      case u: Union if u.resolved =>
+        u.output.foreach(ignoredExprIds += _.exprId)
+      case p if canGetOutputAttrs(p) =>
+        p.output.foreach { a =>
+          // NOTE: we still need to filter resolved expressions here because the output of
+          // some resolved logical plans can have unresolved references,
+          // e.g., outer references in `ExistenceJoin`.
+          if (a.resolved) {
+            val prevTypes = exprIds.getOrElseUpdate(a.exprId, mutable.HashSet.empty[DataType])
+            prevTypes += a.dataType.asNullable
+          }
+        }
+      case _ =>
+    }
 
-    groupedDataTypesByExprId.collectFirst {
-      case group if group.length > 1 =>
-        val exprId = group.head._1
-        val types = group.map(_._2.sql)
+    exprIds.collectFirst {
+      case (exprId, types) if types.size > 1 && !ignoredExprIds.contains(exprId) =>
         s"Multiple attributes have the same expression ID ${exprId.id} but different data types: " +
-          types.mkString(", ") + ". The plan tree:\n" + plan.treeString
+          types.map(_.sql).mkString(", ") + ". The plan tree:\n" + plan.treeString
     }
   }
 
```
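To make the validated invariant concrete, below is a rough REPL-style sketch (untested, and written against Catalyst's internal constructors for `AttributeReference`, `Alias`, `LocalRelation`, and `Project`, so treat the exact parameter lists as assumptions) of the kind of plan this check is meant to flag: two output attributes share expression ID 1 but carry different data types.

```scala
import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference, ExprId, Literal}
import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlanIntegrity, Project}
import org.apache.spark.sql.types.IntegerType

// An INT attribute that owns expression ID 1.
val a = AttributeReference("a", IntegerType)(exprId = ExprId(1))
// A STRING-typed alias that (incorrectly) reuses the same expression ID.
val badAlias = Alias(Literal("x"), "a")(exprId = a.exprId)

// The Project's output and the LocalRelation's output now expose ExprId 1 with two types.
val plan = Project(Seq(badAlias), LocalRelation(a))

// Expected to be defined, with a message along the lines of
// "Multiple attributes have the same expression ID 1 but different data types: ...".
LogicalPlanIntegrity.hasUniqueExprIdsForOutput(plan)
```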

0 commit comments
