Support video depth visualization

Lihe Yang
2024-01-25 18:27:01 +08:00
committed by GitHub
parent 1ad37271a1
commit 5761e0b520
6 changed files with 45 additions and 15 deletions

README.md

@@ -19,7 +19,10 @@ This work presents Depth Anything, a highly practical solution for robust monocu
## News
-* **2024-01-22:** Paper, project page, code, models, and demo are released.
+* **2024-01-25:** Support [video depth visualization](./run_video.py). Also, both [online demo](https://huggingface.co/spaces/LiheYoung/Depth-Anything) and [local demo](./app.py) support video input.
+* **2024-01-23:** The new ControlNet based on Depth Anything is integrated into [ControlNet WebUI](https://github.com/Mikubill/sd-webui-controlnet) and [ComfyUI's ControlNet](https://github.com/Fannovel16/comfyui_controlnet_aux).
+* **2024-01-23:** Depth Anything [ONNX](https://github.com/fabio-sim/Depth-Anything-ONNX) and [TensorRT](https://github.com/spacewalk01/depth-anything-tensorrt) versions are supported.
+* **2024-01-22:** Paper, project page, code, models, and demo ([HuggingFace](https://huggingface.co/spaces/LiheYoung/Depth-Anything), [OpenXLab](https://openxlab.org.cn/apps/detail/yyfan/depth_anything)) are released.
## Features of Depth Anything
@@ -35,7 +38,7 @@ This work presents Depth Anything, a highly practical solution for robust monocu
- **Better depth-conditioned ControlNet**
-We re-train **a better depth-conditioned ControlNet** based on Depth Anything. It offers more precise synthesis than the previous MiDaS-based ControlNet. Please refer [here](./controlnet/) for details.
+We re-train **a better depth-conditioned ControlNet** based on Depth Anything. It offers more precise synthesis than the previous MiDaS-based ControlNet. Please see [here](./controlnet/) for details. You can also use our new ControlNet based on Depth Anything in [ControlNet WebUI](https://github.com/Mikubill/sd-webui-controlnet).
- **Downstream high-level scene understanding**
@@ -68,7 +71,7 @@ We provide three models of varying scales for robust relative depth estimation:
| Depth-Anything-Base | 97.5M | 13 | 9 | 6 |
| Depth-Anything-Large | 335.3M | 20 | 13 | 12 |
-Note that the V100 and A100 inference time (*without TensorRT*) is computed by excluding the pre-processing and post-processing stages, whereas the last column RTX4090 (*with TensorRT*) is computed by including these two stages. See [here]() for details.
+Note that the V100 and A100 inference times (*without TensorRT*) exclude the pre-processing and post-processing stages, whereas the RTX4090 time in the last column (*with TensorRT*) includes them (please refer to [Depth-Anything-TensorRT](https://github.com/spacewalk01/depth-anything-tensorrt)).
You can easily load our pre-trained models by:
```python
@@ -115,9 +118,13 @@ For the ``img-path``, you can either 1) point it to an image directory storing a
For example:
```bash
-python run.py --encoder vitl --img-path assets/examples --outdir depth_visualization
+python run.py --encoder vitl --img-path assets/examples --outdir depth_vis
```
**If you want to use Depth Anything on videos:**
```bash
python run_video.py --encoder vitl --video-path assets/examples_video --outdir video_depth_vis
```
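
The body of `run_video.py` is not reproduced in this diff. As a rough idea of what per-frame video depth visualization involves, here is a minimal sketch, assuming it reuses the model loading and pre-processing of `run.py` and uses OpenCV for video I/O; the input/output paths, encoder choice, and per-frame normalization are illustrative assumptions, not the script's actual code:

```python
# Minimal sketch of per-frame video depth (NOT the actual run_video.py):
# reuse the image pipeline's model and pre-processing, and let OpenCV
# handle video I/O. Paths and the encoder choice are illustrative.
import os

import cv2
import numpy as np
import torch
import torch.nn.functional as F
from torchvision.transforms import Compose

from depth_anything.dpt import DepthAnything
from depth_anything.util.transform import Resize, NormalizeImage, PrepareForNet

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
depth_anything = DepthAnything.from_pretrained('LiheYoung/depth_anything_vits14').to(DEVICE).eval()

# Same pre-processing as the image pipeline in run.py.
transform = Compose([
    Resize(width=518, height=518, resize_target=False, keep_aspect_ratio=True,
           ensure_multiple_of=14, resize_method='lower_bound',
           image_interpolation_method=cv2.INTER_CUBIC),
    NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    PrepareForNet(),
])

cap = cv2.VideoCapture('assets/examples_video/input.mp4')  # illustrative path
fps = cap.get(cv2.CAP_PROP_FPS)
w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
os.makedirs('video_depth_vis', exist_ok=True)
out = cv2.VideoWriter('video_depth_vis/output.mp4', cv2.VideoWriter_fourcc(*'mp4v'), fps, (w, h))

while True:
    ret, raw_frame = cap.read()
    if not ret:
        break
    frame = cv2.cvtColor(raw_frame, cv2.COLOR_BGR2RGB) / 255.0
    frame = transform({'image': frame})['image']
    frame = torch.from_numpy(frame).unsqueeze(0).to(DEVICE)

    with torch.no_grad():
        depth = depth_anything(frame)  # relative depth, shape (1, H', W')

    # Resize back to the frame size, normalize per frame to 0-255, colorize.
    depth = F.interpolate(depth[None], (h, w), mode='bilinear', align_corners=False)[0, 0]
    depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
    out.write(cv2.applyColorMap(depth.cpu().numpy().astype(np.uint8), cv2.COLORMAP_INFERNO))

cap.release()
out.release()
```

Note that per-frame min-max normalization keeps each frame's contrast high but can flicker across frames; a fixed normalization range would trade contrast for temporal stability.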
### Gradio demo
@@ -144,7 +151,7 @@ import cv2
import torch
encoder = 'vits' # can also be 'vitb' or 'vitl'
-depth_anything = DepthAnything.from_pretrained('LiheYoung/depth_anything_{:}14'.format(encoder))
+depth_anything = DepthAnything.from_pretrained('LiheYoung/depth_anything_{:}14'.format(encoder)).eval()
transform = Compose([
Resize(
@@ -169,6 +176,23 @@ depth = depth_anything(image)
```
</details>
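
The `.eval()` appended above puts the model in inference mode, switching off training-time behaviors such as dropout. For pure prediction you would typically also disable autograd; this is the standard PyTorch pattern, not something specific to this repo:

```python
# eval() fixes module behavior; no_grad() skips gradient tracking and saves memory.
with torch.no_grad():
    depth = depth_anything(image)
```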
### Do not want to manually define image pre-processing and download our model definition files?
Easily use Depth Anything through ``transformers``! Please refer to [these instructions](https://huggingface.co/LiheYoung/depth-anything-small-hf) (credit to [@niels](https://huggingface.co/nielsr)).
<details>
<summary>Click here for a brief demo:</summary>
```python
from transformers import pipeline
from PIL import Image
image = Image.open('Your-image-path')
pipe = pipeline(task="depth-estimation", model="LiheYoung/depth-anything-small-hf")
depth = pipe(image)["depth"]
```
</details>
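
For reference, the `transformers` depth-estimation pipeline returns the visual map under the `"depth"` key as a PIL image, alongside the raw prediction under `"predicted_depth"`; a quick follow-up to the snippet above:

```python
# "depth" is a PIL image, so it can be saved or displayed directly;
# "predicted_depth" is the raw torch.Tensor prediction.
depth.save('depth_vis.png')
raw = pipe(image)["predicted_depth"]
```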
## Community Support
**We sincerely appreciate all the extensions the community has built on Depth Anything. Thank you!**
@@ -177,10 +201,17 @@ Here we list the extensions we have found:
- Depth Anything ONNX: https://github.com/fabio-sim/Depth-Anything-ONNX
- Depth Anything TensorRT: https://github.com/spacewalk01/depth-anything-tensorrt
- Depth Anything in ControlNet WebUI: https://github.com/Mikubill/sd-webui-controlnet
- Depth Anything in ComfyUI's ControlNet: https://github.com/Fannovel16/comfyui_controlnet_aux
- Depth Anything in X-AnyLabeling: https://github.com/CVHub520/X-AnyLabeling
- Depth Anything in OpenXLab: https://openxlab.org.cn/apps/detail/yyfan/depth_anything
If you have an amazing project that supports or improves (*e.g.*, speeds up) Depth Anything, please feel free to open an issue. We will add it here.
## Acknowledgement
We would like to express our deepest gratitude to [AK (@_akhaliq)](https://twitter.com/_akhaliq) and the awesome HuggingFace team ([@niels](https://huggingface.co/nielsr), [@hysts](https://huggingface.co/hysts), and [@yuvraj](https://huggingface.co/ysharma)) for helping improve the online demo and build the HF models.
## Citation
If you find this project useful, please consider citing:

app.py

@@ -28,6 +28,7 @@ model = DepthAnything.from_pretrained('LiheYoung/depth_anything_vitl14').to(DEVI
title = "# Depth Anything"
description = """Official demo for **Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data**.
Please refer to our [paper](https://arxiv.org/abs/2401.10891), [project page](https://depth-anything.github.io), or [github](https://github.com/LiheYoung/Depth-Anything) for more details."""
transform = Compose([
@@ -56,7 +57,7 @@ with gr.Blocks(css=css) as demo:
    with gr.Row():
        input_image = gr.Image(label="Input Image", type='numpy', elem_id='img-display-input')
-        depth_image_slider = ImageSlider(label="Depth Map with Slider View", elem_id='img-display-output', position=0)
+        depth_image_slider = ImageSlider(label="Depth Map with Slider View", elem_id='img-display-output', position=0.5)
    raw_file = gr.File(label="16-bit raw depth (can be considered as disparity)")
    submit = gr.Button("Submit")
@@ -88,7 +89,7 @@ with gr.Blocks(css=css) as demo:
    example_files.sort()
    example_files = [os.path.join('assets/examples', filename) for filename in example_files]
    examples = gr.Examples(examples=example_files, inputs=[input_image], outputs=[depth_image_slider, raw_file], fn=on_submit, cache_examples=False)
if __name__ == '__main__':
    demo.queue().launch()

3 binary files changed but not shown.

run.py

@@ -28,13 +28,11 @@ if __name__ == '__main__':
    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
-    depth_anything = DepthAnything.from_pretrained('LiheYoung/depth_anything_{}14'.format(args.encoder)).to(DEVICE)
+    depth_anything = DepthAnything.from_pretrained('LiheYoung/depth_anything_{}14'.format(args.encoder)).to(DEVICE).eval()
    total_params = sum(param.numel() for param in depth_anything.parameters())
    print('Total parameters: {:.2f}M'.format(total_params / 1e6))
-    depth_anything.eval()
    transform = Compose([
        Resize(
            width=518,
@@ -57,9 +55,11 @@ if __name__ == '__main__':
        filenames = [args.img_path]
    else:
        filenames = os.listdir(args.img_path)
-        filenames = [os.path.join(args.img_path, filename) for filename in filenames]
+        filenames = [os.path.join(args.img_path, filename) for filename in filenames if not filename.startswith('.')]
        filenames.sort()
+    os.makedirs(args.outdir, exist_ok=True)
    for filename in tqdm(filenames):
        raw_image = cv2.imread(filename)
        image = cv2.cvtColor(raw_image, cv2.COLOR_BGR2RGB) / 255.0
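
The added `startswith('.')` filter skips hidden directory entries such as macOS `.DS_Store` files, which would otherwise reach `cv2.imread` (returning `None`) and crash the subsequent `cv2.cvtColor` call. A quick illustration:

```python
filenames = ['photo1.png', '.DS_Store', 'photo2.jpg']
[f for f in filenames if not f.startswith('.')]  # ['photo1.png', 'photo2.jpg']
```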
@@ -78,9 +78,6 @@ if __name__ == '__main__':
        depth = depth.cpu().numpy().astype(np.uint8)
        depth_color = cv2.applyColorMap(depth, cv2.COLORMAP_INFERNO)
-        os.makedirs(args.outdir, exist_ok=True)
-        filename = os.path.basename(filename)
        split_region = np.ones((raw_image.shape[0], margin_width, 3), dtype=np.uint8) * 255
        combined_results = cv2.hconcat([raw_image, split_region, depth_color])
@@ -99,4 +96,5 @@ if __name__ == '__main__':
        final_result = cv2.vconcat([caption_space, combined_results])
-        cv2.imwrite(os.path.join(args.outdir, filename[:filename.find('.')] + '_img_depth.png'), final_result)
+        filename = os.path.basename(filename)
+        cv2.imwrite(os.path.join(args.outdir, filename[:filename.rfind('.')] + '_img_depth.png'), final_result)
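
One detail worth noting in the final hunk: switching from `find` to `rfind` changes which dot the output filename is split at, which matters for names with dots in the stem. A quick illustration:

```python
filename = 'scene.raw.png'
filename[:filename.find('.')]   # 'scene'      -- truncates at the first dot
filename[:filename.rfind('.')]  # 'scene.raw'  -- strips only the extension
```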