Save prediction only

Lihe Yang
2024-02-02 19:46:47 +08:00
committed by GitHub
parent 308b513f52
commit e911178017
7 changed files with 48 additions and 27 deletions


@@ -82,7 +82,7 @@ encoder = 'vits' # can also be 'vitb' or 'vitl'
depth_anything = DepthAnything.from_pretrained('LiheYoung/depth_anything_{:}14'.format(encoder))
```
Depth Anything is also supported in ``transformers``. You can use it for depth prediction within [3 lines of code](https://huggingface.co/docs/transformers/main/model_doc/depth_anything) (credit to [@niels](https://huggingface.co/nielsr)).
Depth Anything is also supported in [``transformers``](https://github.com/huggingface/transformers). You can use it for depth prediction within [3 lines of code](https://huggingface.co/docs/transformers/main/model_doc/depth_anything) (credit to [@niels](https://huggingface.co/nielsr)).
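For reference, a minimal sketch of that ``transformers`` route via the depth-estimation pipeline (the Hub checkpoint id ``LiheYoung/depth-anything-small-hf`` and the file paths are assumptions, not part of this commit):
```python
from transformers import pipeline
from PIL import Image

# Checkpoint id is an assumption; the base/large variants should work the same way.
pipe = pipeline(task="depth-estimation", model="LiheYoung/depth-anything-small-hf")

image = Image.open("your_image.jpg")   # illustrative input path
depth = pipe(image)["depth"]           # PIL.Image depth map
depth.save("your_image_depth.png")
```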
### No network connection, cannot load these models?
@@ -115,9 +115,12 @@ pip install -r requirements.txt
### Running
```bash
python run.py --encoder <vits | vitb | vitl> --img-path <img-directory | single-img | txt-file> --outdir <outdir>
python run.py --encoder <vits | vitb | vitl> --img-path <img-directory | single-img | txt-file> --outdir <outdir> [--pred-only] [--grayscale]
```
For the ``img-path``, you can either 1) point it to an image directory containing all the images of interest, 2) point it to a single image, or 3) point it to a text file storing all image paths.
Arguments:
- ``--img-path``: you can either 1) point it to an image directory containing all the images of interest, 2) point it to a single image, or 3) point it to a text file storing all image paths.
- ``--pred-only`` saves only the predicted depth map. Without it, the image and its depth map are visualized side by side by default.
- ``--grayscale`` saves the depth map in grayscale. Without it, a color palette is applied to the depth map by default.
For example:
```bash
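# Illustrative invocation of the new flags; the image-directory path below is an assumption
python run.py --encoder vits --img-path assets/examples --outdir ./vis_depth --pred-only --grayscale
```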
@@ -182,8 +185,12 @@ depth = depth_anything(image)
### Do not want to define image pre-processing or download model definition files?
Easily use Depth Anything through ``transformers`` within 3 lines of code! Please refer to [these instructions](https://huggingface.co/docs/transformers/main/model_doc/depth_anything) (credit to [@niels](https://huggingface.co/nielsr)).
Easily use Depth Anything through [``transformers``](https://github.com/huggingface/transformers) within 3 lines of code! Please refer to [these instructions](https://huggingface.co/docs/transformers/main/model_doc/depth_anything) (credit to [@niels](https://huggingface.co/nielsr)).
**Note:** If you encounter ``KeyError: 'depth_anything'``, please install the latest ``transformers`` from source:
```bash
pip install git+https://github.com/huggingface/transformers.git
```
<details>
<summary>Click here for a brief demo:</summary>

run.py

@@ -17,6 +17,9 @@ if __name__ == '__main__':
    parser.add_argument('--outdir', type=str, default='./vis_depth')
    parser.add_argument('--encoder', type=str, default='vitl', choices=['vits', 'vitb', 'vitl'])
    parser.add_argument('--pred-only', dest='pred_only', action='store_true', help='only save the predicted depth map')
    parser.add_argument('--grayscale', dest='grayscale', action='store_true', help='do not apply the color palette')
    args = parser.parse_args()
    margin_width = 50
@@ -76,25 +79,35 @@ if __name__ == '__main__':
        depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
        depth = depth.cpu().numpy().astype(np.uint8)
        depth_color = cv2.applyColorMap(depth, cv2.COLORMAP_INFERNO)
        split_region = np.ones((raw_image.shape[0], margin_width, 3), dtype=np.uint8) * 255
        combined_results = cv2.hconcat([raw_image, split_region, depth_color])
        caption_space = np.ones((caption_height, combined_results.shape[1], 3), dtype=np.uint8) * 255
        captions = ['Raw image', 'Depth Anything']
        segment_width = w + margin_width
        for i, caption in enumerate(captions):
            # Calculate text size
            text_size = cv2.getTextSize(caption, font, font_scale, font_thickness)[0]
            # Calculate x-coordinate to center the text
            text_x = int((segment_width * i) + (w - text_size[0]) / 2)
            # Add text caption
            cv2.putText(caption_space, caption, (text_x, 40), font, font_scale, (0, 0, 0), font_thickness)
        final_result = cv2.vconcat([caption_space, combined_results])
        # Keep the depth map grayscale (replicated to 3 channels) or colorize it with the INFERNO palette
        if args.grayscale:
            depth = np.repeat(depth[..., np.newaxis], 3, axis=-1)
        else:
            depth = cv2.applyColorMap(depth, cv2.COLORMAP_INFERNO)
        filename = os.path.basename(filename)
        cv2.imwrite(os.path.join(args.outdir, filename[:filename.rfind('.')] + '_img_depth.png'), final_result)
        # Save only the depth prediction, or build the captioned side-by-side visualization
        if args.pred_only:
            cv2.imwrite(os.path.join(args.outdir, filename[:filename.rfind('.')] + '_depth.png'), depth)
        else:
            split_region = np.ones((raw_image.shape[0], margin_width, 3), dtype=np.uint8) * 255
            combined_results = cv2.hconcat([raw_image, split_region, depth])
            caption_space = np.ones((caption_height, combined_results.shape[1], 3), dtype=np.uint8) * 255
            captions = ['Raw image', 'Depth Anything']
            segment_width = w + margin_width
            for i, caption in enumerate(captions):
                # Calculate text size
                text_size = cv2.getTextSize(caption, font, font_scale, font_thickness)[0]
                # Calculate x-coordinate to center the text
                text_x = int((segment_width * i) + (w - text_size[0]) / 2)
                # Add text caption
                cv2.putText(caption_space, caption, (text_x, 40), font, font_scale, (0, 0, 0), font_thickness)
            final_result = cv2.vconcat([caption_space, combined_results])
            cv2.imwrite(os.path.join(args.outdir, filename[:filename.rfind('.')] + '_img_depth.png'), final_result)


@@ -37,6 +37,7 @@ Note that our results are obtained *without* Mapillary pre-training.
- [Cityscapes-ViT-L-mIoU-86.4](https://huggingface.co/spaces/LiheYoung/Depth-Anything/blob/main/checkpoints_semseg/cityscapes_vitl_mIoU_86.4.pth)
- [ADE20K-ViT-L-mIoU-59.4](https://huggingface.co/spaces/LiheYoung/Depth-Anything/blob/main/checkpoints_semseg/ade20k_vitl_mIoU_59.4.pth)
**Note:** If you want to reproduce the training process, please 1) download the [Depth Anything pre-trained model](https://huggingface.co/spaces/LiheYoung/Depth-Anything/blob/main/checkpoints/depth_anything_vitl14.pth) (to initialize the encoder) and 2) put it under the ``checkpoints`` folder.
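One possible way to do this from the shell (a sketch only; the ``resolve`` URL form is assumed from the blob link above):
```bash
mkdir -p checkpoints
# Download the encoder weights into ./checkpoints (URL form is an assumption)
wget -O checkpoints/depth_anything_vitl14.pth \
    https://huggingface.co/spaces/LiheYoung/Depth-Anything/resolve/main/checkpoints/depth_anything_vitl14.pth
```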
## Installation


@@ -20,7 +20,7 @@ model = dict(
        type='DINOv2',
        version='large',
        freeze=False,
        load_from='../checkpoints/depth_anything_vitl14.pth'),
        load_from='./checkpoints/depth_anything_vitl14.pth'),
    neck=dict(type='Feature2Pyramid', embed_dim=1024, rescales=[4, 2, 1, 0.5]),
    decode_head=dict(
        type='Mask2FormerHead',


@@ -20,7 +20,7 @@ model = dict(
        type='DINOv2',
        version='large',
        freeze=False,
        load_from='../checkpoints/depth_anything_vitl14.pth'),
        load_from='./checkpoints/depth_anything_vitl14.pth'),
    neck=dict(type='Feature2Pyramid', embed_dim=1024, rescales=[4, 2, 1, 0.5]),
    decode_head=dict(
        type='Mask2FormerHead',


@@ -21,7 +21,7 @@ model = dict(
        type='DINOv2',
        version='large',
        freeze=False,
        load_from='../checkpoints/depth_anything_vitl14.pth'),
        load_from='./checkpoints/depth_anything_vitl14.pth'),
    neck=dict(type='Feature2Pyramid', embed_dim=1024, rescales=[4, 2, 1, 0.5]),
    decode_head=dict(
        type='Mask2FormerHead',


@@ -15,7 +15,7 @@ class DINOv2(nn.Module):
        super().__init__()
        if version == 'large':
            self.dinov2 = torch.hub.load('torchhub/facebookresearch_dinov2_main', 'dinov2_vit14', source='local', pretrained=False)
            self.dinov2 = torch.hub.load('torchhub/facebookresearch_dinov2_main', 'dinov2_vitl14', source='local', pretrained=False)
        else:
            raise NotImplementedError