|
| 1 | +@startuml Spark-dotnet-sequence-diagram-udf-data |
| 2 | +title "Sequence Diagram for Processing Pipeline with Spark .NET: UDF & Data retrieval" |
| 3 | + |
| 4 | +skinparam dpi 200 |
| 5 | +skinparam BoxPadding 10 |
| 6 | + |
| 7 | +actor "User" as user |
| 8 | + |
| 9 | +box "Master Node" |
| 10 | +participant "Spark: Master" as spark_master |
| 11 | +participant "JVM<->.NET Bridge" as bridge |
| 12 | +participant "MyProgram.exe:\nUser .NET App" as dotnet_master |
| 13 | +participant "Microsoft.Spark\n(NuGet Package)" as dotnet_nuget |
| 14 | +end box |
| 15 | + |
| 16 | +box "Worker Node\n(One of Many)" |
| 17 | +participant "Spark: Worker" as spark_worker |
| 18 | +participant "Microsoft.Spark.Worker" as dotnet_worker |
| 19 | +end box |
| 20 | + |
| 21 | +user -> spark_master: Executes \n**spark-submit** microsoft-spark-xx.jar\n--files MyUdfs.dll MyProgram.zip |
| 22 | +activate spark_master |
| 23 | + |
| 24 | +spark_master -> bridge: Load and start executing jar |
| 25 | +activate bridge |
| 26 | +bridge -> dotnet_master: Start MyProgram |
| 27 | +deactivate bridge |
| 28 | + |
| 29 | +activate dotnet_master |
| 30 | +dotnet_master -> dotnet_nuget: Build SparkSession |
| 31 | +deactivate dotnet_master |
| 32 | +activate dotnet_nuget |
| 33 | + |
| 34 | +dotnet_nuget -> bridge: Connect to socket,\nRequest Spark Session creation |
| 35 | +activate bridge |
| 36 | +bridge -> spark_master: Request Spark Session creation |
| 37 | +return Reference to JVM object SparkSession |
| 38 | +return Session |
| 39 | +activate dotnet_master |
| 40 | + |
| 41 | +group "Register UDF" |
| 42 | + note over dotnet_master |
| 43 | + ""var df = LoadDataFromSomeWhere();"" // This part is omitted |
| 44 | + ""Func<Column, Column> udfArray ="" |
| 45 | + ""Udf<string, string[]>(str => [str, $"{str}-{str.Length}"]);"" |
| 46 | + end note |
| 47 | + |
| 48 | + dotnet_master -> dotnet_nuget: Func<> object |
| 49 | + deactivate dotnet_master |
| 50 | + activate dotnet_nuget |
| 51 | + dotnet_nuget -> dotnet_nuget: Serialize Func using binary serializer |
| 52 | + dotnet_nuget -> bridge: Invoke UDF creation,\nPass serialized UDF as a parameter |
| 53 | + deactivate dotnet_nuget |
| 54 | + activate bridge |
| 55 | + bridge -> spark_master: Register UDF as a PythonFunction,\nSpecify Microsoft.Spark.Worker.exe instead of Python.exe\nDeclare serialized UDF as an argument |
| 56 | + deactivate bridge |
| 57 | + |
| 58 | + spark_master -> spark_master: Register a Python UDF |
| 59 | + |
| 60 | + spark_master --> bridge |
| 61 | + activate bridge |
| 62 | + bridge --> dotnet_nuget: UDF JVM reference |
| 63 | + deactivate bridge |
| 64 | + |
| 65 | + activate dotnet_nuget |
| 66 | + dotnet_nuget --> dotnet_master |
| 67 | + deactivate dotnet_nuget |
| 68 | + activate dotnet_master |
| 69 | +end |
| 70 | + |
| 71 | +group "Invoke UDF" |
| 72 | + note over dotnet_master |
| 73 | + // Cache() needed for immediate invocation, |
| 74 | + // otherwise df invoked lazily when needed |
| 75 | + ""var arrayDF ="" |
| 76 | + ""df.Select(Explode(udfArray(df["value"])))"" |
| 77 | + "".Cache();"" |
| 78 | + end note |
| 79 | + |
| 80 | + |
| 81 | + dotnet_master -> dotnet_nuget |
| 82 | + deactivate dotnet_master |
| 83 | + activate dotnet_nuget |
| 84 | + |
| 85 | + dotnet_nuget -> bridge: Pass calls to bridge |
| 86 | + deactivate dotnet_nuget |
| 87 | + activate bridge |
| 88 | + bridge -> spark_master: Load data,\nGenerate execution graph,\nCreate RDD |
| 89 | + deactivate bridge |
| 90 | + |
| 91 | + spark_master -> spark_worker: Create tasks for processing partitions of RDD |
| 92 | + activate spark_worker |
| 93 | + spark_worker -> dotnet_worker: Start process,\nInitiate socket connection,\nPass task content and serialized UDF |
| 94 | + activate dotnet_worker |
| 95 | + |
| 96 | + dotnet_worker -> dotnet_worker: Deserialize Func and execute it\nPass arguments received from Spark worker |
| 97 | + return UDF execution result |
| 98 | + return |
| 99 | + |
| 100 | + spark_master -> spark_master: Aggregate results from workers |
| 101 | + spark_master --> bridge |
| 102 | + activate bridge |
| 103 | + bridge --> dotnet_nuget |
| 104 | + deactivate bridge |
| 105 | + activate dotnet_nuget |
| 106 | + dotnet_nuget --> dotnet_master |
| 107 | + deactivate dotnet_nuget |
| 108 | + activate dotnet_master |
| 109 | +end |
| 110 | + |
| 111 | +group "Fetch Dataset in .NET Memory" |
| 112 | + note over dotnet_master |
| 113 | + ""var result ="" |
| 114 | + ""arrayDF.Collect().ToList();"" |
| 115 | + end note |
| 116 | + |
| 117 | + dotnet_master -> dotnet_nuget: Collect dataset |
| 118 | + deactivate dotnet_master |
| 119 | + activate dotnet_nuget |
| 120 | + dotnet_nuget -> bridge: Request dataset collection |
| 121 | + deactivate dotnet_nuget |
| 122 | + |
| 123 | + activate bridge |
| 124 | + bridge -> spark_master: .Collect() request |
| 125 | + |
| 126 | + deactivate bridge |
| 127 | + |
| 128 | + spark_master --> bridge: Collected data |
| 129 | + activate bridge |
| 130 | + |
| 131 | + bridge --> dotnet_nuget: Collected data |
| 132 | + deactivate bridge |
| 133 | + activate dotnet_nuget |
| 134 | + dotnet_nuget -> bridge: Initiate broadcast of all rows via socket |
| 135 | + deactivate dotnet_nuget |
| 136 | + activate bridge |
| 137 | + |
| 138 | + bridge -> dotnet_nuget: Entire dataset serialized in Python Pickle format\n**Expensive operation** |
| 139 | + |
| 140 | + deactivate bridge |
| 141 | + |
| 142 | + activate dotnet_nuget |
| 143 | + dotnet_nuget --> dotnet_master: Deserialized row collection |
| 144 | + deactivate dotnet_nuget |
| 145 | + activate dotnet_master |
| 146 | + |
| 147 | +end |
| 148 | + |
| 149 | +activate dotnet_master |
| 150 | +dotnet_master --> bridge: Execution complete |
| 151 | +deactivate dotnet_master |
| 152 | +activate bridge |
| 153 | +bridge --> spark_master: Execution complete |
| 154 | +deactivate bridge |
| 155 | +return Execution complete. |
| 156 | +@enduml |
0 commit comments