IDEA-Research
diff --git a/‎LICENSE b/‎LICENSE
diff --git a/‎asset/demo.jpg
80.7 KB b/‎asset/demo.jpg
80.7 KB
diff --git a/‎asset/demo2.jpeg
59.6 KB b/‎asset/demo2.jpeg
59.6 KB
diff --git a/‎asset/demo3.jpeg
96.6 KB b/‎asset/demo3.jpeg
96.6 KB
diff --git a/‎asset/demo4.jpeg
417 KB b/‎asset/demo4.jpeg
417 KB
diff --git a/‎asset/demo5.jpeg
599 KB b/‎asset/demo5.jpeg
599 KB
diff --git a/‎asset/demo_output.jpg
85.6 KB b/‎asset/demo_output.jpg
85.6 KB
diff --git a/‎demo/demo.py
+28-53 b/‎demo/demo.py
+28-53
diff --git a/‎gdino/__init__.py
+4 b/‎gdino/__init__.py
+4
diff --git a/‎gdino/__pycache__/visualize.cpython-38.pyc
3.09 KB b/‎gdino/__pycache__/visualize.cpython-38.pyc
3.09 KB
diff --git a/‎gdino/model_wrapper.py
+132 b/‎gdino/model_wrapper.py
+132
diff --git a/‎gdino/version.py
+1 b/‎gdino/version.py
+1
diff --git a/‎gdino/visualize.py
+108 b/‎gdino/visualize.py
+108
@@ -1,54 +1,29 @@
+import argparse
 import os
-
-from dds_cloudapi_sdk import Config
-from dds_cloudapi_sdk import Client
-from dds_cloudapi_sdk import DetectionTask
-from dds_cloudapi_sdk import TextPrompt
-from dds_cloudapi_sdk import DetectionModel
-from dds_cloudapi_sdk import DetectionTarget
-
-# Step 1: initialize the config
-token = os.getenv("DDS_API")
-config = Config(token)
-
-# Step 2: initialize the client
-client = Client(config)
-
-# Step 3: run the task by DetectionTask class
-image_url = "https://algosplt.oss-cn-shenzhen.aliyuncs.com/test_files/tasks/detection/iron_man.jpg"
-# if you are processing local image file, upload them to DDS server to get the image url
-# image_url = client.upload_file("/path/to/your/prompt/image.png")
-
-task = DetectionTask(
-    image_url=image_url,
-    prompts=[TextPrompt(text="iron man")],
-    targets=[DetectionTarget.Mask, DetectionTarget.BBox],  # detect both bbox and mask
-    model=DetectionModel.GDino1_5_Pro,  # detect with GroundingDino-1.5-Pro model.
-    # Available models: []
-)
-
-client.run_task(task)
-result = task.result
-
-print(result.mask_url)
-
-objects = result.objects  # the list of detected objects
-for idx, obj in enumerate(objects):
-    print(obj.score)  # 0.42
-
-    print(obj.category)  # "iron man"
-
-    print(obj.bbox)  # [635.0, 458.0, 704.0, 508.0]
-
-    print(
-        obj.mask.counts
-    )  # RLE compressed to string, ]o`f08fa14M3L2O2M2O1O1O1O1N2O1N2O1N2N3M2O3L3M3N2M2N3N1N2O...
-
-    # convert the RLE format to RGBA image
-    mask_image = task.rle2rgba(obj.mask)
-    print(mask_image.size)  # (1600, 1170)
-
-    # save the image to file
-    mask_image.save(f"mask_{idx}.png")
-
-    break
+from gdino import GroundingDINOAPIWrapper, visualize
+from PIL import Image
+import numpy as np
+
+
+def get_args():
+    """Parse the command-line options of the Grounding DINO 1.5 demo.
+
+    Returns:
+        argparse.Namespace: Namespace with the attributes ``token`` (str or
+            None when the flag is omitted) and ``box_threshold`` (float,
+            0.3 by default).
+    """
+    parser = argparse.ArgumentParser(description="Interactive Inference")
+    parser.add_argument(
+        "--token",
+        type=str,
+        # The previous help text named a different product (T-Rex2).
+        help=(
+            "The token for the Grounding DINO 1.5 API. A free token can be "
+            "requested at https://deepdataspace.com/request_api"
+        ),
+    )
+    parser.add_argument(
+        "--box_threshold", type=float, default=0.3, help="The threshold for box score"
+    )
+    return parser.parse_args()
+
+if __name__ == "__main__":
+    # Build the API client from the token given on the command line.
+    cli_args = get_args()
+    detector = GroundingDINOAPIWrapper(cli_args.token)
+
+    # One image path plus the '.'-separated category names to look for.
+    request = {"image": "asset/demo.jpg", "prompt": "person.pigeon.tree"}
+    detections = detector.inference(request)
+
+    # Draw the detections on the source image and write the result to disk.
+    annotated = visualize(Image.open(request["image"]), detections)
+    annotated.save("asset/demo_output.jpg")
@@ -0,0 +1,4 @@
+from .model_wrapper import GroundingDINOAPIWrapper
+from .visualize import visualize
+
+__all__ = ["GroundingDINOAPIWrapper", "visualize"]
@@ -0,0 +1,132 @@
+import tempfile
+from typing import Dict, List, Union
+import numpy as np
+from dds_cloudapi_sdk import (
+    DetectionTask,
+    Client,
+    Config,
+    TextPrompt,
+    DetectionModel,
+    DetectionTarget,
+)
+from PIL import Image
+import concurrent.futures
+
+class GroundingDINOAPIWrapper:
+    """API wrapper for Grounding DINO 1.5.
+
+    Args:
+        token (str): The token for the Grounding DINO 1.5 API. Free API access
+            is offered, with extensive usage quotas for educators, students
+            and researchers. A token can be requested here:
+            https://deepdataspace.com/request_api
+
+    """
+
+    def __init__(self, token: str):
+        self.client = Client(Config(token=token))
+
+    def inference(self, prompt: Dict, return_mask: bool = False):
+        """Run Grounding DINO 1.5 detection on a single image.
+
+        Batch inference is not supported; one image is processed per call.
+
+        Args:
+            prompt (dict): Annotations with the following keys:
+                - "image" (str | np.ndarray): Path to the image, e.g.
+                  "test1.jpg", or the image itself as an array.
+                - "prompt" (str): Text prompt with categories separated by
+                  '.', e.g. 'cate1 . cate2 . cate3'.
+            return_mask (bool): Whether to also request masks. Defaults to False.
+
+        Returns:
+            (Dict): Detection results as built by :meth:`postprocess`, with keys:
+                - "scores" (List[float]): Score of each object.
+                - "categorys" (List[str]): Category of each object.
+                - "boxes" (List[List[float]]): Box of each object in
+                  [xmin, ymin, xmax, ymax] format.
+                - "masks" (List): Decoded mask of each object; empty unless
+                  ``return_mask`` is True.
+        """
+        # Upload the image first: the detection task takes a URL, not raw data.
+        image_url = self.get_image_url(prompt["image"])
+        if return_mask:
+            targets = [DetectionTarget.Mask, DetectionTarget.BBox]
+        else:
+            targets = [DetectionTarget.BBox]
+        task = DetectionTask(
+            image_url=image_url,
+            prompts=[TextPrompt(text=prompt["prompt"])],
+            targets=targets,
+            model=DetectionModel.GDino1_5_Pro,
+        )
+        self.client.run_task(task)
+        return self.postprocess(task.result, task, return_mask)
+
+    def postprocess(self, result, task, return_mask):
+        """Convert the API result into plain Python lists.
+
+        The output lists follow the order of ``result.objects``.
+
+        Args:
+            result (TaskResult): Task result exposing ``objects``; each object
+                provides:
+                    - bbox (List[float]): Box in xyxy format
+                    - category (str): Detection category
+                    - score (float): Detection score
+                    - mask (DetectionObjectMask): RLE-encoded mask
+            task (DetectionTask): The task object; its ``rle2rgba`` decodes masks.
+            return_mask (bool): Whether to decode and return masks.
+
+        Returns:
+            (Dict): Return dict in format:
+                {
+                    "scores": (List[float]): A list of scores for each object
+                    "categorys": (List[str]): A list of categorys for each object
+                    "boxes": (List[List[float]]): A list of boxes for each object
+                    "masks": (List): Decoded masks as returned by
+                        ``task.rle2rgba`` (presumably PIL images - confirm
+                        against the SDK); empty when ``return_mask`` is False
+                }
+        """
+        objects = list(result.objects)
+        boxes = [obj.bbox for obj in objects]
+        scores = [obj.score for obj in objects]
+        categorys = [obj.category for obj in objects]
+
+        masks = []
+        if return_mask and objects:
+            # Only mask decoding is worth parallelising. ``Executor.map``
+            # yields results in submission order, so the masks stay in the
+            # same order as the boxes (``as_completed`` would shuffle them).
+            with concurrent.futures.ThreadPoolExecutor() as executor:
+                decoded = executor.map(
+                    lambda obj: task.rle2rgba(obj.mask), objects
+                )
+                # Masks that decode to None are dropped, as before.
+                masks = [mask for mask in decoded if mask is not None]
+
+        return dict(boxes=boxes, categorys=categorys, scores=scores, masks=masks)
+
+    def get_image_url(self, image: Union[str, np.ndarray]):
+        """Upload an image to the server and return its url.
+
+        Args:
+            image (Union[str, np.ndarray]): The image to upload. Can be a file path or np.ndarray.
+                If it is a np.ndarray, it will be saved to a temporary PNG file.
+
+        Returns:
+            str: The url of the image
+        """
+        if isinstance(image, str):
+            return self.client.upload_file(image)
+
+        import os  # local import: only needed to clean up the temporary file
+
+        # The file must be fully written and closed before it is uploaded by
+        # path: an open handle may still hold buffered data, and on Windows an
+        # open NamedTemporaryFile cannot be reopened. Hence delete=False plus
+        # explicit removal.
+        tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
+        try:
+            with tmp_file:
+                # image is in numpy format, convert to PIL Image
+                Image.fromarray(image).save(tmp_file, format="PNG")
+            return self.client.upload_file(tmp_file.name)
+        finally:
+            os.remove(tmp_file.name)
@@ -0,0 +1 @@
+# Version identifier of the gdino package.
+__version__ = 'v1.5'
@@ -0,0 +1,108 @@
+from typing import Dict
+
+import numpy as np
+from PIL import Image, ImageDraw, ImageFont, ImageOps
+import random
+
+
+def draw_mask(mask, draw, random_color=True):
+    """Paint every non-zero pixel of a mask through an ImageDraw object.
+
+    Args:
+        mask (np.array): Mask as a NumPy array; non-zero entries are painted.
+            Indexed as (row, column), i.e. (y, x).
+        draw (ImageDraw.Draw): ImageDraw object to draw on the image.
+        random_color (bool): Whether to use a random color for the mask.
+            When False a fixed blue (30, 144, 255) is used. The alpha value
+            is 153 in both cases.
+    """
+    if random_color:
+        color = (
+            random.randint(0, 255),
+            random.randint(0, 255),
+            random.randint(0, 255),
+            153,
+        )
+    else:
+        color = (30, 144, 255, 153)
+
+    # np.nonzero yields (row, col) indices; ImageDraw expects (x, y), hence
+    # the reversal of each coordinate.
+    nonzero_coords = np.transpose(np.nonzero(mask)).tolist()
+    points = [tuple(coord[::-1]) for coord in nonzero_coords]
+
+    # A single batched call replaces one Python-level call per pixel.
+    if points:
+        draw.point(points, fill=color)
+
+def visualize(image_pil: Image,
+              result: Dict,
+              draw_width: float = 6.0,
+              return_mask=True,
+              draw_score=True) -> Image:
+    """Plot bounding boxes, labels and (optionally) masks on an image.
+
+    Boxes and labels are drawn directly onto ``image_pil``, so a PIL input is
+    modified in place. When masks are overlaid, a new RGB image is returned.
+
+    Args:
+        image_pil (PIL.Image | np.ndarray): The input image. A NumPy array is
+            converted to a PIL image first.
+        result (Dict): The detection result. The keys are:
+                - boxes (List[List[float]]): Bounding boxes in [x1, y1, x2, y2] format.
+                - scores (List[float]): A list of scores for each bounding box. shape (N)
+                - categorys (List[str]): A list of categorys for each object
+                - masks (List[PIL.Image]): Optional list of masks; the last
+                  channel of each mask is used as the region to paint.
+        draw_width (float): Outline width of the boxes, truncated to an int.
+            Defaults to 6.0.
+        return_mask (bool): Overlay the masks when any are provided.
+            Defaults to True.
+        draw_score (bool): Draw the score next to the category. Defaults to True.
+
+    Returns:
+        PIL.Image: The input image with plotted bounding boxes, labels, and masks.
+    """
+    # Get the bounding boxes and labels from the result dictionary
+    boxes = result["boxes"]
+    scores = result["scores"]
+    categorys = result["categorys"]
+    masks = result.get("masks", [])
+
+    # One random color per distinct category
+    cate2color = {
+        cate: tuple(np.random.randint(0, 255, size=3).tolist())
+        for cate in set(categorys)
+    }
+
+    # Create a PIL ImageDraw object to draw on the input image
+    if isinstance(image_pil, np.ndarray):
+        image_pil = Image.fromarray(image_pil)
+    draw = ImageDraw.Draw(image_pil)
+
+    # Loop invariants: the label font and the integer outline width
+    font = ImageFont.load_default()
+    outline_width = int(draw_width)
+
+    # Draw the box and its label for each detection
+    for box, score, category in zip(boxes, scores, categorys):
+        x0, y0, x1, y1 = (int(value) for value in box)
+        color = cate2color[category]
+
+        # Draw the box outline on the input image
+        draw.rectangle([x0, y0, x1, y1], outline=color, width=outline_width)
+
+        # Build the label text, with the score when requested
+        if draw_score:
+            text = f"{category} {score:.2f}"
+        else:
+            text = f"{category}"
+
+        # Fonts without ``getbbox`` fall back to the older ``textsize`` API
+        # to measure the label background.
+        if hasattr(font, "getbbox"):
+            bbox = draw.textbbox((x0, y0), text, font)
+        else:
+            w, h = draw.textsize(text, font)
+            bbox = (x0, y0, w + x0, y0 + h)
+        draw.rectangle(bbox, fill=color)
+        draw.text((x0, y0), text, fill="white", font=font)
+
+    # Draw the masks on a transparent overlay and composite it onto the image
+    if len(masks) > 0 and return_mask:
+        mask_image = Image.new("RGBA", image_pil.size, color=(0, 0, 0, 0))
+        mask_draw = ImageDraw.Draw(mask_image)
+        for mask in masks:
+            # The last channel of the mask marks the pixels to paint
+            draw_mask(np.array(mask)[:, :, -1], mask_draw)
+
+        image_pil = Image.alpha_composite(image_pil.convert("RGBA"), mask_image).convert("RGB")
+    return image_pil