From a76191ccbecaa00b693de483286cecbddb1f9a96 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Thu, 29 May 2025 09:53:41 +0800 Subject: [PATCH 1/8] Add NPU support for nms_op by dispatching to device-specific implementation --- python/oneflow/nn/modules/nms.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/oneflow/nn/modules/nms.py b/python/oneflow/nn/modules/nms.py index 7fdb64f0087..96b71ae8754 100644 --- a/python/oneflow/nn/modules/nms.py +++ b/python/oneflow/nn/modules/nms.py @@ -19,6 +19,8 @@ def nms_op(boxes, scores, iou_threshold: float): + if boxes.device == flow.device("npu"): + return flow._C.nms(boxes, scores, iou_threshold) score_inds = flow.argsort(scores, dim=0, descending=True) boxes = flow._C.gather(boxes, score_inds, axis=0) keep = flow._C.nms(boxes, iou_threshold) From 119e3834de7f502bdbdd475006e8cd30e64942b8 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Thu, 29 May 2025 10:59:50 +0800 Subject: [PATCH 2/8] fused nms --- oneflow/core/functional/functional_api.yaml | 2 +- oneflow/core/functional/impl/nn_functor.cpp | 18 +++++++++++++++--- oneflow/ir/include/OneFlow/OneFlowUserOps.td | 3 ++- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/oneflow/core/functional/functional_api.yaml b/oneflow/core/functional/functional_api.yaml index 5c6d8148ac1..044d4251246 100644 --- a/oneflow/core/functional/functional_api.yaml +++ b/oneflow/core/functional/functional_api.yaml @@ -2789,7 +2789,7 @@ bind_python: False - name: "nms" - signature: "Tensor (Tensor x, Float iou_threshold, Int32 keep_n=-1) => Nms" + signature: "Tensor (Tensor x, Tensor scores=None, Float iou_threshold, Int32 keep_n=-1) => Nms" bind_python: True - name: "roi_align" diff --git a/oneflow/core/functional/impl/nn_functor.cpp b/oneflow/core/functional/impl/nn_functor.cpp index 51f42367c07..92bc40343c7 100644 --- a/oneflow/core/functional/impl/nn_functor.cpp +++ b/oneflow/core/functional/impl/nn_functor.cpp @@ -4014,17 +4014,29 @@ class PariticalFCSampleDisableBoxing { class NmsFunctor { public: - NmsFunctor() { op_ = CHECK_JUST(one::OpBuilder("nms").Input("in").Output("out").Build()); } + NmsFunctor() { + op_ = CHECK_JUST(one::OpBuilder("nms").Input("in").Output("out").Build()); + fused_op_ = CHECK_JUST(one::OpBuilder("nms").Input("in").Input("scores").Output("out").Build()); + } - Maybe operator()(const std::shared_ptr& x, const float& iou_threshold, + Maybe operator()(const std::shared_ptr& x, + const Optional& scores, + const float& iou_threshold, const int32_t& keep_n) const { auto& attrs = THREAD_CACHED_MUTABLE_ATTR_MAP("iou_threshold", "keep_n"); attrs.SetAllAttrs(iou_threshold, keep_n); - return OpInterpUtil::Dispatch(*op_, {x}, attrs); + DeviceType device_type = JUST(x->device())->enum_type(); + if (device_type == DeviceType::kNPU) { + return OpInterpUtil::Dispatch(*fused_op_, {x, JUST(scores)}, attrs); + } + else { + return OpInterpUtil::Dispatch(*op_, {x}, attrs); + } } private: std::shared_ptr op_; + std::shared_ptr fused_op_; }; class RoiAlignFunctor { diff --git a/oneflow/ir/include/OneFlow/OneFlowUserOps.td b/oneflow/ir/include/OneFlow/OneFlowUserOps.td index 7532cfe441e..62f748e2f33 100644 --- a/oneflow/ir/include/OneFlow/OneFlowUserOps.td +++ b/oneflow/ir/include/OneFlow/OneFlowUserOps.td @@ -1886,7 +1886,8 @@ def OneFlow_InTopKOp : OneFlow_BaseOp<"in_top_k", [NoMemoryEffect, NoGrad, Decla def OneFlow_NmsOp : OneFlow_BaseOp<"nms", [NoMemoryEffect, DeclareOpInterfaceMethods]> { let input = (ins - OneFlow_Tensor:$in + OneFlow_Tensor:$in, + Optional:$scores ); let output = (outs OneFlow_Tensor:$out From 670ee0aa482ec8dd09557b838ad9febb722c5f67 Mon Sep 17 00:00:00 2001 From: oneflow-ci-bot Date: Thu, 29 May 2025 03:02:47 +0000 Subject: [PATCH 3/8] auto format by CI --- oneflow/core/functional/impl/nn_functor.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/oneflow/core/functional/impl/nn_functor.cpp b/oneflow/core/functional/impl/nn_functor.cpp index 92bc40343c7..f85bf719243 100644 --- a/oneflow/core/functional/impl/nn_functor.cpp +++ b/oneflow/core/functional/impl/nn_functor.cpp @@ -4020,16 +4020,14 @@ class NmsFunctor { } Maybe operator()(const std::shared_ptr& x, - const Optional& scores, - const float& iou_threshold, + const Optional& scores, const float& iou_threshold, const int32_t& keep_n) const { auto& attrs = THREAD_CACHED_MUTABLE_ATTR_MAP("iou_threshold", "keep_n"); attrs.SetAllAttrs(iou_threshold, keep_n); DeviceType device_type = JUST(x->device())->enum_type(); if (device_type == DeviceType::kNPU) { return OpInterpUtil::Dispatch(*fused_op_, {x, JUST(scores)}, attrs); - } - else { + } else { return OpInterpUtil::Dispatch(*op_, {x}, attrs); } } From 87969be29d90df5332b0bc9e3902502e46dc26c2 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Thu, 29 May 2025 11:19:09 +0800 Subject: [PATCH 4/8] Ensure scores is present or fall back to the non-fused implementation. --- oneflow/core/functional/impl/nn_functor.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/oneflow/core/functional/impl/nn_functor.cpp b/oneflow/core/functional/impl/nn_functor.cpp index f85bf719243..2411a4ea45e 100644 --- a/oneflow/core/functional/impl/nn_functor.cpp +++ b/oneflow/core/functional/impl/nn_functor.cpp @@ -4026,7 +4026,11 @@ class NmsFunctor { attrs.SetAllAttrs(iou_threshold, keep_n); DeviceType device_type = JUST(x->device())->enum_type(); if (device_type == DeviceType::kNPU) { - return OpInterpUtil::Dispatch(*fused_op_, {x, JUST(scores)}, attrs); + if (scores) { + return OpInterpUtil::Dispatch(*fused_op_, {x, JUST(scores)}, attrs); + } else { + return OpInterpUtil::Dispatch(*op_, {x}, attrs); + } } else { return OpInterpUtil::Dispatch(*op_, {x}, attrs); } From 86a56cca8071af003c827f4bf6d713e7feb824ed Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Fri, 30 May 2025 14:36:45 +0800 Subject: [PATCH 5/8] update --- python/oneflow/nn/modules/nms.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/python/oneflow/nn/modules/nms.py b/python/oneflow/nn/modules/nms.py index 96b71ae8754..4398a7a12d8 100644 --- a/python/oneflow/nn/modules/nms.py +++ b/python/oneflow/nn/modules/nms.py @@ -20,9 +20,29 @@ def nms_op(boxes, scores, iou_threshold: float): if boxes.device == flow.device("npu"): + if boxes.ndim == 2 and boxes.shape[-1] == 4: + boxes = boxes.unsqueeze(0) + elif boxes.ndim != 3 or boxes.shape[-1] != 4: + raise ValueError(f"boxes must be of shape [B, N, 4] or [N, 4], but got {boxes.shape}") + + if scores.ndim == 1: + scores = scores.unsqueeze(0).unsqueeze(0) + elif scores.ndim == 2: + scores = scores.unsqueeze(0) + elif scores.ndim != 3: + raise ValueError(f"scores must be of shape [B, C, N], [C, N] or [N], but got {scores.shape}") + + if boxes.shape[0] != scores.shape[0]: + raise ValueError(f"batch_size mismatch: boxes {boxes.shape[0]} vs scores {scores.shape[0]}") + if boxes.shape[1] != scores.shape[2]: + raise ValueError(f"spatial_dimension mismatch: boxes {boxes.shape[1]} vs scores {scores.shape[2]}") + return flow._C.nms(boxes, scores, iou_threshold) + score_inds = flow.argsort(scores, dim=0, descending=True) boxes = flow._C.gather(boxes, score_inds, axis=0) - keep = flow._C.nms(boxes, iou_threshold) + keep = flow._C.nms(boxes, iou_threshold=iou_threshold) + print(keep.shape) index = flow.squeeze(flow.argwhere(keep), dim=[1]) + print(index.shape) return flow._C.gather(score_inds, index, axis=0) From 6d85296b4e4646bd0f0f21d9b578a8c31aac4b6f Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Sun, 1 Jun 2025 20:48:56 +0800 Subject: [PATCH 6/8] use score_inds --- oneflow/core/functional/functional_api.yaml | 2 +- .../core/functional/impl/array_functor.cpp | 20 ++++++++++++- oneflow/core/functional/impl/nn_functor.cpp | 6 ++-- oneflow/ir/include/OneFlow/OneFlowUserOps.td | 3 +- oneflow/user/ops/nms_op.cpp | 8 ++++- python/oneflow/nn/modules/nms.py | 30 ++++--------------- 6 files changed, 38 insertions(+), 31 deletions(-) diff --git a/oneflow/core/functional/functional_api.yaml b/oneflow/core/functional/functional_api.yaml index 044d4251246..e633b01392a 100644 --- a/oneflow/core/functional/functional_api.yaml +++ b/oneflow/core/functional/functional_api.yaml @@ -2789,7 +2789,7 @@ bind_python: False - name: "nms" - signature: "Tensor (Tensor x, Tensor scores=None, Float iou_threshold, Int32 keep_n=-1) => Nms" + signature: "Tensor (Tensor x, Tensor scores=None, Tensor input_indices=None, Float iou_threshold, Int32 keep_n=-1) => Nms" bind_python: True - name: "roi_align" diff --git a/oneflow/core/functional/impl/array_functor.cpp b/oneflow/core/functional/impl/array_functor.cpp index aef7ef62a3b..4a960cba34d 100644 --- a/oneflow/core/functional/impl/array_functor.cpp +++ b/oneflow/core/functional/impl/array_functor.cpp @@ -588,7 +588,25 @@ class ArgWhereFunctor { const Symbol& dtype) const { auto& attrs = THREAD_CACHED_MUTABLE_ATTR_MAP("dtype"); attrs.SetAllAttrs(dtype->data_type()); - return OpInterpUtil::Dispatch(*op_, {x}, attrs); + + auto device_type = DeviceType::kCPU; + if (x->is_global()) { + device_type = JUST(x->parallel_desc())->device_type(); + } else { + device_type = JUST(x->device())->enum_type(); + } + + if (device_type == DeviceType::kNPU) { + // NOTE: use cpu argwhere when device="npu" + auto cpu_tensor = JUST(one::functional::To(x, "cpu")); + auto result = JUST(OpInterpUtil::Dispatch(*op_, {cpu_tensor}, attrs)); + for (int i = 0; i < result->size(); ++i) { + (*result)[i] = JUST(one::functional::To((*result)[i], "npu")); + } + return result; + } else { + return OpInterpUtil::Dispatch(*op_, {x}, attrs); + } } private: diff --git a/oneflow/core/functional/impl/nn_functor.cpp b/oneflow/core/functional/impl/nn_functor.cpp index 2411a4ea45e..6c9647e01ca 100644 --- a/oneflow/core/functional/impl/nn_functor.cpp +++ b/oneflow/core/functional/impl/nn_functor.cpp @@ -4016,18 +4016,18 @@ class NmsFunctor { public: NmsFunctor() { op_ = CHECK_JUST(one::OpBuilder("nms").Input("in").Output("out").Build()); - fused_op_ = CHECK_JUST(one::OpBuilder("nms").Input("in").Input("scores").Output("out").Build()); + fused_op_ = CHECK_JUST(one::OpBuilder("nms").Input("in").Input("scores").Input("input_indices").Output("out").Build()); } Maybe operator()(const std::shared_ptr& x, - const Optional& scores, const float& iou_threshold, + const Optional& scores, const Optional& input_indices, const float& iou_threshold, const int32_t& keep_n) const { auto& attrs = THREAD_CACHED_MUTABLE_ATTR_MAP("iou_threshold", "keep_n"); attrs.SetAllAttrs(iou_threshold, keep_n); DeviceType device_type = JUST(x->device())->enum_type(); if (device_type == DeviceType::kNPU) { if (scores) { - return OpInterpUtil::Dispatch(*fused_op_, {x, JUST(scores)}, attrs); + return OpInterpUtil::Dispatch(*fused_op_, {x, JUST(scores), JUST(input_indices)}, attrs); } else { return OpInterpUtil::Dispatch(*op_, {x}, attrs); } diff --git a/oneflow/ir/include/OneFlow/OneFlowUserOps.td b/oneflow/ir/include/OneFlow/OneFlowUserOps.td index 62f748e2f33..190536c9ce5 100644 --- a/oneflow/ir/include/OneFlow/OneFlowUserOps.td +++ b/oneflow/ir/include/OneFlow/OneFlowUserOps.td @@ -1887,7 +1887,8 @@ def OneFlow_InTopKOp : OneFlow_BaseOp<"in_top_k", [NoMemoryEffect, NoGrad, Decla def OneFlow_NmsOp : OneFlow_BaseOp<"nms", [NoMemoryEffect, DeclareOpInterfaceMethods]> { let input = (ins OneFlow_Tensor:$in, - Optional:$scores + Optional:$scores, + Optional:$input_indices ); let output = (outs OneFlow_Tensor:$out diff --git a/oneflow/user/ops/nms_op.cpp b/oneflow/user/ops/nms_op.cpp index e49236c7e8c..3f4fb7b31f8 100644 --- a/oneflow/user/ops/nms_op.cpp +++ b/oneflow/user/ops/nms_op.cpp @@ -26,7 +26,13 @@ Maybe InferNmsTensorDesc(user_op::InferContext* ctx) { } Maybe InferNmsDataType(user_op::InferContext* ctx) { - ctx->SetOutputDType("out", 0, DataType::kInt8); + if (ctx->parallel_desc().device_type() == DeviceType::kNPU) { + LOG(ERROR) << "InferNmsDataType npu"; + ctx->SetOutputDType("out", 0, DataType::kInt32); + } else { + LOG(ERROR) << "cpu"; + ctx->SetOutputDType("out", 0, DataType::kInt8); + } return Maybe::Ok(); } diff --git a/python/oneflow/nn/modules/nms.py b/python/oneflow/nn/modules/nms.py index 4398a7a12d8..86400565e10 100644 --- a/python/oneflow/nn/modules/nms.py +++ b/python/oneflow/nn/modules/nms.py @@ -19,30 +19,12 @@ def nms_op(boxes, scores, iou_threshold: float): - if boxes.device == flow.device("npu"): - if boxes.ndim == 2 and boxes.shape[-1] == 4: - boxes = boxes.unsqueeze(0) - elif boxes.ndim != 3 or boxes.shape[-1] != 4: - raise ValueError(f"boxes must be of shape [B, N, 4] or [N, 4], but got {boxes.shape}") - - if scores.ndim == 1: - scores = scores.unsqueeze(0).unsqueeze(0) - elif scores.ndim == 2: - scores = scores.unsqueeze(0) - elif scores.ndim != 3: - raise ValueError(f"scores must be of shape [B, C, N], [C, N] or [N], but got {scores.shape}") - - if boxes.shape[0] != scores.shape[0]: - raise ValueError(f"batch_size mismatch: boxes {boxes.shape[0]} vs scores {scores.shape[0]}") - if boxes.shape[1] != scores.shape[2]: - raise ValueError(f"spatial_dimension mismatch: boxes {boxes.shape[1]} vs scores {scores.shape[2]}") - - return flow._C.nms(boxes, scores, iou_threshold) - score_inds = flow.argsort(scores, dim=0, descending=True) - boxes = flow._C.gather(boxes, score_inds, axis=0) - keep = flow._C.nms(boxes, iou_threshold=iou_threshold) - print(keep.shape) + if boxes.device == flow.device("npu"): + sorted_scores = flow.gather(scores, dim=0, index=score_inds) + keep = flow._C.nms(boxes, sorted_scores, score_inds.to(flow.int32), iou_threshold=iou_threshold) + else: + boxes = flow._C.gather(boxes, score_inds, axis=0) + keep = flow._C.nms(boxes, iou_threshold=iou_threshold) index = flow.squeeze(flow.argwhere(keep), dim=[1]) - print(index.shape) return flow._C.gather(score_inds, index, axis=0) From f171b46b1f83bc236acd192a5e9c06cea5870f0e Mon Sep 17 00:00:00 2001 From: oneflow-ci-bot Date: Sun, 1 Jun 2025 13:02:03 +0000 Subject: [PATCH 7/8] auto format by CI --- oneflow/core/functional/impl/nn_functor.cpp | 13 ++++++++++--- python/oneflow/nn/modules/nms.py | 4 +++- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/oneflow/core/functional/impl/nn_functor.cpp b/oneflow/core/functional/impl/nn_functor.cpp index 6c9647e01ca..3861c417a31 100644 --- a/oneflow/core/functional/impl/nn_functor.cpp +++ b/oneflow/core/functional/impl/nn_functor.cpp @@ -4016,18 +4016,25 @@ class NmsFunctor { public: NmsFunctor() { op_ = CHECK_JUST(one::OpBuilder("nms").Input("in").Output("out").Build()); - fused_op_ = CHECK_JUST(one::OpBuilder("nms").Input("in").Input("scores").Input("input_indices").Output("out").Build()); + fused_op_ = CHECK_JUST(one::OpBuilder("nms") + .Input("in") + .Input("scores") + .Input("input_indices") + .Output("out") + .Build()); } Maybe operator()(const std::shared_ptr& x, - const Optional& scores, const Optional& input_indices, const float& iou_threshold, + const Optional& scores, + const Optional& input_indices, const float& iou_threshold, const int32_t& keep_n) const { auto& attrs = THREAD_CACHED_MUTABLE_ATTR_MAP("iou_threshold", "keep_n"); attrs.SetAllAttrs(iou_threshold, keep_n); DeviceType device_type = JUST(x->device())->enum_type(); if (device_type == DeviceType::kNPU) { if (scores) { - return OpInterpUtil::Dispatch(*fused_op_, {x, JUST(scores), JUST(input_indices)}, attrs); + return OpInterpUtil::Dispatch(*fused_op_, {x, JUST(scores), JUST(input_indices)}, + attrs); } else { return OpInterpUtil::Dispatch(*op_, {x}, attrs); } diff --git a/python/oneflow/nn/modules/nms.py b/python/oneflow/nn/modules/nms.py index 86400565e10..f6059d38161 100644 --- a/python/oneflow/nn/modules/nms.py +++ b/python/oneflow/nn/modules/nms.py @@ -22,7 +22,9 @@ def nms_op(boxes, scores, iou_threshold: float): score_inds = flow.argsort(scores, dim=0, descending=True) if boxes.device == flow.device("npu"): sorted_scores = flow.gather(scores, dim=0, index=score_inds) - keep = flow._C.nms(boxes, sorted_scores, score_inds.to(flow.int32), iou_threshold=iou_threshold) + keep = flow._C.nms( + boxes, sorted_scores, score_inds.to(flow.int32), iou_threshold=iou_threshold + ) else: boxes = flow._C.gather(boxes, score_inds, axis=0) keep = flow._C.nms(boxes, iou_threshold=iou_threshold) From 4393f7a133bd5a157284cdefe38a6f2a527b11c2 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Sun, 1 Jun 2025 21:10:24 +0800 Subject: [PATCH 8/8] rm LOG(ERROR) --- oneflow/user/ops/nms_op.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/oneflow/user/ops/nms_op.cpp b/oneflow/user/ops/nms_op.cpp index 3f4fb7b31f8..56e3883254d 100644 --- a/oneflow/user/ops/nms_op.cpp +++ b/oneflow/user/ops/nms_op.cpp @@ -27,10 +27,8 @@ Maybe InferNmsTensorDesc(user_op::InferContext* ctx) { Maybe InferNmsDataType(user_op::InferContext* ctx) { if (ctx->parallel_desc().device_type() == DeviceType::kNPU) { - LOG(ERROR) << "InferNmsDataType npu"; ctx->SetOutputDType("out", 0, DataType::kInt32); } else { - LOG(ERROR) << "cpu"; ctx->SetOutputDType("out", 0, DataType::kInt8); } return Maybe::Ok();