Save prediction only

Lihe Yang
2024-02-02 19:46:47 +08:00
committed by GitHub
parent 308b513f52
commit e911178017
7 changed files with 48 additions and 27 deletions


@@ -82,7 +82,7 @@ encoder = 'vits' # can also be 'vitb' or 'vitl'
depth_anything = DepthAnything.from_pretrained('LiheYoung/depth_anything_{:}14'.format(encoder))
```
Depth Anything is also supported in ``transformers``. You can use it for depth prediction within [3 lines of code](https://huggingface.co/docs/transformers/main/model_doc/depth_anything) (credit to [@niels](https://huggingface.co/nielsr)).
Depth Anything is also supported in [``transformers``](https://github.com/huggingface/transformers). You can use it for depth prediction within [3 lines of code](https://huggingface.co/docs/transformers/main/model_doc/depth_anything) (credit to [@niels](https://huggingface.co/nielsr)).
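For reference, a minimal sketch of that ``transformers`` route via the depth-estimation pipeline (the Hub checkpoint id ``LiheYoung/depth-anything-small-hf`` and the file paths are assumptions, not part of this commit):
```python
from transformers import pipeline
from PIL import Image

# Checkpoint id is an assumption; the base/large variants should work the same way.
pipe = pipeline(task="depth-estimation", model="LiheYoung/depth-anything-small-hf")

image = Image.open("your_image.jpg")   # illustrative input path
depth = pipe(image)["depth"]           # PIL.Image depth map
depth.save("your_image_depth.png")
```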
### No network connection, cannot load these models?
@@ -115,9 +115,12 @@ pip install -r requirements.txt
### Running
```bash
python run.py --encoder <vits | vitb | vitl> --img-path <img-directory | single-img | txt-file> --outdir <outdir>
python run.py --encoder <vits | vitb | vitl> --img-path <img-directory | single-img | txt-file> --outdir <outdir> [--pred-only] [--grayscale]
```
For the ``img-path``, you can either 1) point it to an image directory containing all the images of interest, 2) point it to a single image, or 3) point it to a text file storing all image paths.
Arguments:
- ``--img-path``: you can either 1) point it to an image directory containing all the images of interest, 2) point it to a single image, or 3) point it to a text file storing all image paths.
- ``--pred-only`` saves only the predicted depth map. Without it, the image and its depth map are visualized side by side by default.
- ``--grayscale`` saves the depth map in grayscale. Without it, a color palette is applied to the depth map by default.
For example:
```bash
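# Illustrative invocation of the new flags; the image-directory path below is an assumption
python run.py --encoder vits --img-path assets/examples --outdir ./vis_depth --pred-only --grayscale
```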
@@ -182,8 +185,12 @@ depth = depth_anything(image)
### Do not want to define image pre-processing or download model definition files?
Easily use Depth Anything through ``transformers`` within 3 lines of code! Please refer to [these instructions](https://huggingface.co/docs/transformers/main/model_doc/depth_anything) (credit to [@niels](https://huggingface.co/nielsr)).
Easily use Depth Anything through [``transformers``](https://github.com/huggingface/transformers) within 3 lines of code! Please refer to [these instructions](https://huggingface.co/docs/transformers/main/model_doc/depth_anything) (credit to [@niels](https://huggingface.co/nielsr)).
**Note:** If you encounter ``KeyError: 'depth_anything'``, please install the latest ``transformers`` from source:
```bash
pip install git+https://github.com/huggingface/transformers.git
```
<details>
<summary>Click here for a brief demo:</summary>

run.py

@@ -17,6 +17,9 @@ if __name__ == '__main__':
    parser.add_argument('--outdir', type=str, default='./vis_depth')
    parser.add_argument('--encoder', type=str, default='vitl', choices=['vits', 'vitb', 'vitl'])
    parser.add_argument('--pred-only', dest='pred_only', action='store_true', help='only save the predicted depth map')
    parser.add_argument('--grayscale', dest='grayscale', action='store_true', help='do not apply the color palette')
    args = parser.parse_args()
    margin_width = 50
@@ -76,25 +79,35 @@ if __name__ == '__main__':
        depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
        depth = depth.cpu().numpy().astype(np.uint8)
        depth_color = cv2.applyColorMap(depth, cv2.COLORMAP_INFERNO)
        split_region = np.ones((raw_image.shape[0], margin_width, 3), dtype=np.uint8) * 255
        combined_results = cv2.hconcat([raw_image, split_region, depth_color])
        caption_space = np.ones((caption_height, combined_results.shape[1], 3), dtype=np.uint8) * 255
        captions = ['Raw image', 'Depth Anything']
        segment_width = w + margin_width
        for i, caption in enumerate(captions):
            # Calculate text size
            text_size = cv2.getTextSize(caption, font, font_scale, font_thickness)[0]
            # Calculate x-coordinate to center the text
            text_x = int((segment_width * i) + (w - text_size[0]) / 2)
            # Add text caption
            cv2.putText(caption_space, caption, (text_x, 40), font, font_scale, (0, 0, 0), font_thickness)
        final_result = cv2.vconcat([caption_space, combined_results])
        # Keep the depth map grayscale (replicated to 3 channels) or colorize it with the INFERNO palette
        if args.grayscale:
            depth = np.repeat(depth[..., np.newaxis], 3, axis=-1)
        else:
            depth = cv2.applyColorMap(depth, cv2.COLORMAP_INFERNO)
        filename = os.path.basename(filename)
        cv2.imwrite(os.path.join(args.outdir, filename[:filename.rfind('.')] + '_img_depth.png'), final_result)
        # Save only the depth prediction, or build the captioned side-by-side visualization
        if args.pred_only:
            cv2.imwrite(os.path.join(args.outdir, filename[:filename.rfind('.')] + '_depth.png'), depth)
        else:
            split_region = np.ones((raw_image.shape[0], margin_width, 3), dtype=np.uint8) * 255
            combined_results = cv2.hconcat([raw_image, split_region, depth])
            caption_space = np.ones((caption_height, combined_results.shape[1], 3), dtype=np.uint8) * 255
            captions = ['Raw image', 'Depth Anything']
            segment_width = w + margin_width
            for i, caption in enumerate(captions):
                # Calculate text size
                text_size = cv2.getTextSize(caption, font, font_scale, font_thickness)[0]
                # Calculate x-coordinate to center the text
                text_x = int((segment_width * i) + (w - text_size[0]) / 2)
                # Add text caption
                cv2.putText(caption_space, caption, (text_x, 40), font, font_scale, (0, 0, 0), font_thickness)
            final_result = cv2.vconcat([caption_space, combined_results])
            cv2.imwrite(os.path.join(args.outdir, filename[:filename.rfind('.')] + '_img_depth.png'), final_result)


@@ -37,6 +37,7 @@ Note that our results are obtained *without* Mapillary pre-training.
- [Cityscapes-ViT-L-mIoU-86.4](https://huggingface.co/spaces/LiheYoung/Depth-Anything/blob/main/checkpoints_semseg/cityscapes_vitl_mIoU_86.4.pth)
- [ADE20K-ViT-L-mIoU-59.4](https://huggingface.co/spaces/LiheYoung/Depth-Anything/blob/main/checkpoints_semseg/ade20k_vitl_mIoU_59.4.pth)
**Note:** If you want to reproduce the training process, please 1) download the [Depth Anything pre-trained model](https://huggingface.co/spaces/LiheYoung/Depth-Anything/blob/main/checkpoints/depth_anything_vitl14.pth) (to initialize the encoder) and 2) put it under the ``checkpoints`` folder.
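One possible way to do this from the shell (a sketch only; the ``resolve`` URL form is assumed from the blob link above):
```bash
mkdir -p checkpoints
# Download the encoder weights into ./checkpoints (URL form is an assumption)
wget -O checkpoints/depth_anything_vitl14.pth \
    https://huggingface.co/spaces/LiheYoung/Depth-Anything/resolve/main/checkpoints/depth_anything_vitl14.pth
```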
## Installation


@@ -20,7 +20,7 @@ model = dict(
        type='DINOv2',
        version='large',
        freeze=False,
        load_from='../checkpoints/depth_anything_vitl14.pth'),
        load_from='./checkpoints/depth_anything_vitl14.pth'),
    neck=dict(type='Feature2Pyramid', embed_dim=1024, rescales=[4, 2, 1, 0.5]),
    decode_head=dict(
        type='Mask2FormerHead',


@@ -20,7 +20,7 @@ model = dict(
        type='DINOv2',
        version='large',
        freeze=False,
        load_from='../checkpoints/depth_anything_vitl14.pth'),
        load_from='./checkpoints/depth_anything_vitl14.pth'),
    neck=dict(type='Feature2Pyramid', embed_dim=1024, rescales=[4, 2, 1, 0.5]),
    decode_head=dict(
        type='Mask2FormerHead',


@@ -21,7 +21,7 @@ model = dict(
        type='DINOv2',
        version='large',
        freeze=False,
        load_from='../checkpoints/depth_anything_vitl14.pth'),
        load_from='./checkpoints/depth_anything_vitl14.pth'),
    neck=dict(type='Feature2Pyramid', embed_dim=1024, rescales=[4, 2, 1, 0.5]),
    decode_head=dict(
        type='Mask2FormerHead',


@@ -15,7 +15,7 @@ class DINOv2(nn.Module):
        super().__init__()
        if version == 'large':
            self.dinov2 = torch.hub.load('torchhub/facebookresearch_dinov2_main', 'dinov2_vit14', source='local', pretrained=False)
            self.dinov2 = torch.hub.load('torchhub/facebookresearch_dinov2_main', 'dinov2_vitl14', source='local', pretrained=False)
        else:
            raise NotImplementedError