Commit ecdcd68

NCHWc ReorderOutput->Transpose(NHWC) fusion (#3035)
Add support to fuse ReorderOutput+Transpose(NHWC). Converting from NCHWc to NHWC tensors is a trivial copy of the data and avoids the cost of a Transpose node.
1 parent 71ca43b commit ecdcd68
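
Note (not part of the commit message): the reason this fusion is cheap is visible from the layouts. NCHWc stores a tensor as [N, C/block, H, W, block], so the `block` channels at each (n, h, w) position are already contiguous exactly as NHWC wants them; producing NHWC is a series of small contiguous copies rather than the strided element gathers a separate Transpose node would perform. A minimal sketch, with a hypothetical helper name and assuming the channel count is already padded to a multiple of the block size (the real work is done by MlasReorderOutputNhwc):

```cpp
// Illustrative sketch only -- not the MLAS kernel. Copies an NCHWc tensor laid out as
// [N, C/block, H, W, block] into an NHWC tensor [N, H, W, C], where C is the padded
// (block-aligned) channel count. Each inner run of `block` channels is contiguous in
// both layouts, so the reorder reduces to small contiguous copies.
#include <cstddef>
#include <cstring>

void ReorderNchwcToNhwcSketch(const float* src, float* dst,
                              size_t N, size_t C, size_t H, size_t W,
                              size_t block) {
  const size_t channel_blocks = C / block;  // C assumed to be a multiple of block
  for (size_t n = 0; n < N; ++n) {
    for (size_t cb = 0; cb < channel_blocks; ++cb) {
      for (size_t hw = 0; hw < H * W; ++hw) {
        // Source: block of channels at (n, cb, h, w); destination: same block inside
        // the channel run of the (n, h, w) pixel in NHWC.
        const float* s = src + (((n * channel_blocks + cb) * H * W) + hw) * block;
        float* d = dst + (n * H * W + hw) * C + cb * block;
        std::memcpy(d, s, block * sizeof(float));
      }
    }
  }
}
```

The actual ReorderOutput kernel additionally trims the padded channel count down to its `channels` attribute; with the new `channels_last` attribute it can emit NHWC directly instead of emitting NCHW and leaving a Transpose in the graph.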

File tree

11 files changed (+534, -195 lines)

onnxruntime/contrib_ops/cpu/nchwc_ops.cc (+26, -12)
@@ -16,15 +16,15 @@ ONNX_CPU_OPERATOR_TYPED_NCHWC_KERNEL(
     float,
     KernelDefBuilder()
         .TypeConstraint("T", DataTypeImpl::GetTensorType<float>()),
-    ReorderInput<float>);
+    ReorderInput);
 
 ONNX_CPU_OPERATOR_TYPED_NCHWC_KERNEL(
     ReorderOutput,
     1,
     float,
     KernelDefBuilder()
         .TypeConstraint("T", DataTypeImpl::GetTensorType<float>()),
-    ReorderOutput<float>);
+    ReorderOutput);
 
 ONNX_CPU_OPERATOR_TYPED_NCHWC_KERNEL(
     Conv,
@@ -67,27 +67,41 @@ ONNX_CPU_OPERATOR_TYPED_NCHWC_KERNEL(
         .TypeConstraint("T", DataTypeImpl::GetTensorType<float>()),
     NchwcAveragePool);
 
-template <typename T>
-Status ReorderInput<T>::Compute(OpKernelContext* context) const {
+Status ReorderInput::Compute(OpKernelContext* context) const {
   const auto* X = context->Input<Tensor>(0);
   const auto& X_shape = X->Shape();
   ORT_ENFORCE(X_shape.NumDimensions() == 4);
   ORT_ENFORCE((X_shape[1] % MlasNchwcGetBlockSize()) == 0);
   auto* Y = context->Output(0, X_shape);
-  MlasReorderInput(X_shape.GetDims().data(), X->template Data<T>(), Y->template MutableData<T>());
+  MlasReorderInput(X_shape.GetDims().data(), X->template Data<float>(), Y->template MutableData<float>());
   return Status::OK();
 }
 
-template <typename T>
-Status ReorderOutput<T>::Compute(OpKernelContext* context) const {
+Status ReorderOutput::Compute(OpKernelContext* context) const {
   const auto* X = context->Input<Tensor>(0);
   const auto& X_shape = X->Shape();
-  ORT_ENFORCE(X_shape.NumDimensions() == 4);
-  std::vector<int64_t> Y_shape(X_shape.GetDims());
-  ORT_ENFORCE(channels_ <= Y_shape[1]);
-  Y_shape[1] = channels_;
+  const auto X_rank = X_shape.NumDimensions();
+  ORT_ENFORCE(X_rank == 4);
+  ORT_ENFORCE(channels_ <= X_shape[1]);
+
+  // Build the output shape in NCHW or NHWC order.
+  std::vector<int64_t> Y_shape(X_rank);
+  Y_shape[0] = X_shape[0];
+  Y_shape[channels_last_ ? X_rank - 1 : 1] = channels_;
+  auto* Y_spatial_dims = Y_shape.data() + (channels_last_ ? 1 : 2);
+  for (size_t i = 0; i < X_rank - 2; i++) {
+    Y_spatial_dims[i] = X_shape[2 + i];
+  }
   auto* Y = context->Output(0, Y_shape);
-  MlasReorderOutput(Y_shape.data(), X->template Data<T>(), Y->template MutableData<T>());
+
+  const auto* x_data = X->template Data<float>();
+  auto* y_data = Y->template MutableData<float>();
+  if (channels_last_) {
+    MlasReorderOutputNhwc(Y_shape.data(), x_data, y_data);
+  } else {
+    MlasReorderOutputNchw(Y_shape.data(), x_data, y_data);
+  }
+
   return Status::OK();
 }
 
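Worked example (illustrative sizes, not taken from the diff): for an NCHWc input of shape {1, 64, 28, 28} (64 = channels padded up to the block size) and channels = 60, the code above yields Y_shape = {1, 60, 28, 28} when channels_last_ is 0 and {1, 28, 28, 60} when it is 1. The same shape construction as a standalone sketch (hypothetical function name):

```cpp
// Standalone restatement of the output-shape construction in ReorderOutput::Compute.
#include <cstdint>
#include <vector>

std::vector<int64_t> BuildReorderOutputShape(const std::vector<int64_t>& x_shape,
                                             int64_t channels, bool channels_last) {
  const size_t rank = x_shape.size();  // the kernel enforces rank == 4
  std::vector<int64_t> y_shape(rank);
  y_shape[0] = x_shape[0];                           // batch
  y_shape[channels_last ? rank - 1 : 1] = channels;  // channel dim: last for NHWC, second for NCHW
  int64_t* spatial = y_shape.data() + (channels_last ? 1 : 2);
  for (size_t i = 0; i < rank - 2; ++i) {
    spatial[i] = x_shape[2 + i];                     // copy H and W
  }
  return y_shape;
}
```

Emitting the NHWC shape directly is what allows a following NCHW-to-NHWC Transpose (perm = [0, 2, 3, 1]) to be folded into ReorderOutput; that graph-transformer change is in one of the files not shown in this excerpt.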
onnxruntime/contrib_ops/cpu/nchwc_ops.h (+2, -2)
@@ -12,7 +12,6 @@
 namespace onnxruntime {
 namespace contrib {
 
-template <typename T>
 class ReorderInput : public OpKernel {
  public:
   ReorderInput(const OpKernelInfo& info) : OpKernel(info) {
@@ -21,18 +20,19 @@ class ReorderInput : public OpKernel {
   Status Compute(OpKernelContext* context) const override;
 };
 
-template <typename T>
 class ReorderOutput : public OpKernel {
  public:
   ReorderOutput(const OpKernelInfo& info) : OpKernel(info) {
     ORT_ENFORCE(info.GetAttr<int64_t>("channels", &channels_).IsOK());
     ORT_ENFORCE(channels_ > 0, "invalid channel count");
+    ORT_ENFORCE(info.GetAttr<int64_t>("channels_last", &channels_last_).IsOK());
   }
 
   Status Compute(OpKernelContext* context) const override;
 
  private:
   int64_t channels_;
+  int64_t channels_last_;
 };
 
 class NchwcConv : public OpKernel {

onnxruntime/core/graph/contrib_ops/contrib_defs.cc (+8, -165)
@@ -5,6 +5,7 @@
 #include "core/graph/constants.h"
 #include "core/graph/contrib_ops/attn_lstm_schema_defs.h"
 #include "core/graph/contrib_ops/contrib_defs.h"
+#include "core/graph/contrib_ops/nchwc_schema_defs.h"
 #include "core/graph/contrib_ops/range_schema_defs.h"
 #include "core/graph/op.h"
 #include "onnx/defs/schema.h"
@@ -18,7 +19,6 @@ void convPoolShapeInference(
     bool use_dilation, bool require_kernel_shape,
     int input1Idx,
     int input2Idx);
-void globalPoolTypeShapeInference(ONNX_NAMESPACE::InferenceContext& ctx);
 void matmulShapeInference(
     ONNX_NAMESPACE::InferenceContext& ctx,
     int input1Idx,
@@ -166,37 +166,6 @@ using ONNX_NAMESPACE::AttributeProto;
 using ONNX_NAMESPACE::OpSchema;
 using ONNX_NAMESPACE::OPTIONAL;
 
-void NchwcPoolOpSchemaGenerator(OpSchema& schema) {
-  schema.SetDomain(kMSNchwcDomain);
-  schema.SinceVersion(1);
-  schema.SetDoc(R"DOC(For internal use.)DOC");
-  schema.Attr("auto_pad", "", AttributeProto::STRING, std::string("NOTSET"));
-  schema.Attr("kernel_shape", "", AttributeProto::INTS);
-  schema.Attr("dilations", "", AttributeProto::INTS, OPTIONAL);
-  schema.Attr("strides", "", AttributeProto::INTS, OPTIONAL);
-  schema.Attr("pads", "", AttributeProto::INTS, OPTIONAL);
-  schema.Attr("ceil_mode", "", AttributeProto::INT, static_cast<int64_t>(0));
-  schema.Input(0, "X", "", "T");
-  schema.Output(0, "Y", "", "T");
-  schema.TypeConstraint("T", {"tensor(float)"}, "Constrain input and output types to float tensors");
-  schema.TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
-    ONNX_NAMESPACE::propagateElemTypeFromInputToOutput(ctx, 0, 0);
-    ONNX_NAMESPACE::convPoolShapeInference(ctx, true, true, 0, 1);
-  });
-}
-
-void NchwcGlobalPoolOpSchemaGenerator(OpSchema& schema) {
-  schema.SetDomain(kMSNchwcDomain);
-  schema.SinceVersion(1);
-  schema.SetDoc(R"DOC(For internal use.)DOC");
-  schema.Input(0, "X", "", "T");
-  schema.Output(0, "Y", "", "T");
-  schema.TypeConstraint("T", {"tensor(float)"}, "Constrain input and output types to float tensors");
-  schema.TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
-    ONNX_NAMESPACE::globalPoolTypeShapeInference(ctx);
-  });
-}
-
 void ValidateTypeAndShapeForScaleAndZP(ONNX_NAMESPACE::InferenceContext& ctx, int index, ::google::protobuf::int32 expectedType, bool isScalar, int expectedTensorSize = 0) {
   if (ctx.getNumInputs() > static_cast<size_t>(index)) {
     auto data_type = ctx.getInputType(index);
@@ -320,132 +289,6 @@ const char* contrib_ops_auto_pad_doc =
     "In case of odd number add the extra padding at the end for SAME_UPPER and at the "
     "beginning for SAME_LOWER. VALID mean no padding.";
 
-void RegisterNchwcSchemas() {
-  ONNX_CONTRIB_OPERATOR_SCHEMA(ReorderInput)
-      .SetDomain(kMSNchwcDomain)
-      .SinceVersion(1)
-      .SetDoc(R"DOC(For internal use.)DOC")
-      .Input(0, "X", "", "T")
-      .Output(0, "Y", "", "T")
-      .TypeConstraint(
-          "T",
-          {"tensor(float)", "tensor(int8)", "tensor(uint8)"},
-          "Constrain input and output types to float/quantized tensors")
-      .TypeAndShapeInferenceFunction(ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput);
-
-  ONNX_CONTRIB_OPERATOR_SCHEMA(ReorderOutput)
-      .SetDomain(kMSNchwcDomain)
-      .SinceVersion(1)
-      .SetDoc(R"DOC(For internal use.)DOC")
-      .Attr(
-          "channels",
-          "",
-          AttributeProto::INT,
-          static_cast<int64_t>(0))
-      .Input(0, "X", "", "T")
-      .Output(0, "Y", "", "T")
-      .TypeConstraint(
-          "T",
-          {"tensor(float)", "tensor(int8)", "tensor(uint8)"},
-          "Constrain input and output types to float/quantized tensors")
-      .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
-        propagateElemTypeFromInputToOutput(ctx, 0, 0);
-        if (!hasNInputShapes(ctx, 1)) {
-          return;
-        }
-        propagateShapeFromInputToOutput(ctx, 0, 0);
-
-        // Update the output shape with the actual number of channels.
-        auto channels = getAttribute(ctx, "channels", 0);
-        if (channels <= 0) {
-          fail_shape_inference("invalid channel count");
-        }
-        auto output_shape = ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape();
-        if (output_shape->dim_size() < 2) {
-          fail_shape_inference("tensor rank too small");
-        }
-        auto* channels_dim = output_shape->mutable_dim(1);
-        channels_dim->clear_dim_param();
-        channels_dim->set_dim_value(channels);
-      });
-
-  ONNX_CONTRIB_OPERATOR_SCHEMA(Conv)
-      .SetDomain(kMSNchwcDomain)
-      .SinceVersion(1)
-      .SetDoc(R"DOC(For internal use.)DOC")
-      .Attr(
-          "auto_pad",
-          "",
-          AttributeProto::STRING,
-          std::string("NOTSET"))
-      .Attr(
-          "kernel_shape",
-          "",
-          AttributeProto::INTS,
-          OPTIONAL)
-      .Attr(
-          "dilations",
-          "",
-          AttributeProto::INTS,
-          OPTIONAL)
-      .Attr(
-          "strides",
-          "",
-          AttributeProto::INTS,
-          OPTIONAL)
-      .Attr(
-          "pads",
-          "",
-          AttributeProto::INTS, OPTIONAL)
-      .Attr(
-          "group",
-          "",
-          AttributeProto::INT,
-          static_cast<int64_t>(1))
-      .Attr(
-          "activation",
-          "",
-          AttributeProto::STRING,
-          OPTIONAL)
-      .Attr(
-          "activation_params",
-          "",
-          AttributeProto::FLOATS,
-          OPTIONAL)
-      .Input(0, "X", "", "T")
-      .Input(1, "W", "", "T")
-      .Input(2, "B", "", "T", OpSchema::Optional)
-      .Input(3, "Sum", "", "T", OpSchema::Optional)
-      .Output(0, "Y", "", "T")
-      .TypeConstraint("T", {"tensor(float)"}, "Constrain input and output types to float tensors")
-      .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
-        ONNX_NAMESPACE::propagateElemTypeFromInputToOutput(ctx, 0, 0);
-        ONNX_NAMESPACE::convPoolShapeInference(ctx, true, false, 0, 1);
-      });
-
-  ONNX_CONTRIB_OPERATOR_SCHEMA(MaxPool)
-      .FillUsing(NchwcPoolOpSchemaGenerator)
-      .Attr(
-          "storage_order",
-          "",
-          AttributeProto::INT,
-          static_cast<int64_t>(0));
-
-  ONNX_CONTRIB_OPERATOR_SCHEMA(AveragePool)
-      .FillUsing(NchwcPoolOpSchemaGenerator)
-      .Attr(
-          "count_include_pad",
-          "",
-          AttributeProto::INT,
-          static_cast<int64_t>(0));
-
-  ONNX_CONTRIB_OPERATOR_SCHEMA(GlobalMaxPool)
-      .FillUsing(NchwcGlobalPoolOpSchemaGenerator);
-
-  ONNX_CONTRIB_OPERATOR_SCHEMA(GlobalAveragePool)
-      .FillUsing(NchwcGlobalPoolOpSchemaGenerator);
-}
-
 void RegisterBertSchemas() {
   ONNX_CONTRIB_OPERATOR_SCHEMA(Attention)
       .SetDomain(kMSDomain)
@@ -1383,8 +1226,8 @@ activation and leaky_relu_alpha.)DOC")
   ONNX_CONTRIB_OPERATOR_SCHEMA_ELSEWHERE(Range, RegisterRangeOpSchema);
 
   static const char* QuantizeLinear_ver1_doc = R"DOC(
-The linear quantization operator. It consumes a full precision data, a scale, a zero point and computes the quantized data. 
-The quantization formula is y = (x / y_scale) + y_zero_point. For (x / y_scale), it computes the nearest integer value to arg (in floating-point format), 
+The linear quantization operator. It consumes a full precision data, a scale, a zero point and computes the quantized data.
+The quantization formula is y = (x / y_scale) + y_zero_point. For (x / y_scale), it computes the nearest integer value to arg (in floating-point format),
 rounding halfway cases away from zero. Scale and zero point must have same shape. They must be either scalar (per tensor) or 1-D tensor (per 'axis').)DOC";
 
   ONNX_CONTRIB_OPERATOR_SCHEMA(QuantizeLinear)
@@ -1440,8 +1283,8 @@ The quantization formula is y = (x / y_scale),
       });
 
   static const char* DequantizeLinear_ver1_doc = R"DOC(
-The linear dequantization operator. It consumes a quantized data, a scale, a zero point and computes the full precision data. 
-The dequantization formula is y = (x - x_zero_point) * x_scale. 
+The linear dequantization operator. It consumes a quantized data, a scale, a zero point and computes the full precision data.
+The dequantization formula is y = (x - x_zero_point) * x_scale.
 Scale and zero point must have same shape. They must be either scalar (per tensor) or 1-D tensor (per 'axis').)DOC";
 
   ONNX_CONTRIB_OPERATOR_SCHEMA(DequantizeLinear)
@@ -1682,7 +1525,7 @@ Computes the mean of the low-precision input tensor's element along the provided
 The resulting tensor has the same rank as the input if keepdims equal 1. If keepdims equal 0,
 then the resulting tensor have the reduced dimension pruned. The above behavior is similar to numpy,
 with the exception that numpy default keepdims to False instead of True.
-Input and Output scales and zero points are used to requantize the output in a new range. 
+Input and Output scales and zero points are used to requantize the output in a new range.
 This helps to improve accuracy as after ReduceMean operation the range of the output is expected to decrease.
 
 ```
@@ -1861,7 +1704,7 @@ C (int32) = (A - A_zero_point) * (B - B_zero_point)
 ```
 pad_shape[i] = (output_spatial_shape[i] - 1) * strides_spatial_shape[i] + kernel_spatial_shape[i] - input_spatial_shape[i]
 ```
- 
+
 The output of each pooling window is divided by the number of elements (exclude pad when attribute count_include_pad is zero).
 
 Input and output scales and zero points are used to convert the output to a new quantization range.
@@ -2448,7 +2291,7 @@ Example 4:
       R"DOC(Gaussian Error Linear Unit.
 A high-performing neural network activation function.The GELU nonlinearity is
 the expected transformation of a stochastic regularizer which randomly applies
-the identity or zero map to a neuron's input. The GELU nonlinearity weights 
+the identity or zero map to a neuron's input. The GELU nonlinearity weights
 inputs by their magnitude, rather than gates inputs by their sign as in ReLUs.)DOC";
 
   ONNX_CONTRIB_OPERATOR_SCHEMA(Gelu)
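
The NCHWc schema registrations removed from contrib_defs.cc above are moved into the new nchwc_schema_defs files pulled in by the added include (those files are among the 11 changed but are not shown in this excerpt). Purely as a hedged sketch, assuming the relocated ReorderOutput schema gained a channels_last attribute mirroring the kernel change, its shape inference could look roughly like the function below; this is a guess with a hypothetical function name, not a quote of the moved code:

```cpp
// Hypothetical sketch: shape inference for the NCHWc ReorderOutput schema with a
// channels_last attribute, written as a free function that could be passed to
// .TypeAndShapeInferenceFunction(...). Helpers are the same ONNX ones used by the
// removed code above.
#include "onnx/defs/schema.h"
#include "onnx/defs/shape_inference.h"

static void ReorderOutputShapeInference(ONNX_NAMESPACE::InferenceContext& ctx) {
  using namespace ONNX_NAMESPACE;
  propagateElemTypeFromInputToOutput(ctx, 0, 0);
  if (!hasNInputShapes(ctx, 1)) {
    return;
  }
  auto channels = getAttribute(ctx, "channels", 0);
  if (channels <= 0) {
    fail_shape_inference("invalid channel count");
  }
  const bool channels_last = getAttribute(ctx, "channels_last", 0) != 0;
  const auto& input_shape = ctx.getInputType(0)->tensor_type().shape();
  if (input_shape.dim_size() < 2) {
    fail_shape_inference("tensor rank too small");
  }
  auto* output_shape = ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape();
  *output_shape->add_dim() = input_shape.dim(0);       // batch dim propagated
  if (!channels_last) {
    output_shape->add_dim()->set_dim_value(channels);  // NCHW: channels second
  }
  for (int i = 2; i < input_shape.dim_size(); ++i) {
    *output_shape->add_dim() = input_shape.dim(i);     // spatial dims propagated
  }
  if (channels_last) {
    output_shape->add_dim()->set_dim_value(channels);  // NHWC: channels last
  }
}
```

When channels_last is 0 this reduces to the old behavior (propagate the input shape and overwrite dim 1 with channels); when it is 1 the spatial dimensions shift left and the channel dimension moves to the end, matching the kernel's output shape above.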
