Add local demo

Lihe Yang
2024-01-22 14:20:44 +08:00
committed by GitHub
parent c95c9e7729
commit cc507c488c
3 changed files with 117 additions and 11 deletions

README.md

@@ -1,26 +1,22 @@
<div align="center">
<h2>Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data</h2>
[**Lihe Yang**](https://liheyoung.github.io/)<sup>1</sup> · [**Bingyi Kang**](https://scholar.google.com/citations?user=NmHgX-wAAAAJ)<sup>2+</sup> · [**Zilong Huang**](http://speedinghzl.github.io/)<sup>2</sup> · [**Xiaogang Xu**](https://xiaogang00.github.io/)<sup>3,4</sup> · [**Jiashi Feng**](https://sites.google.com/site/jshfeng/)<sup>2</sup> · [**Hengshuang Zhao**](https://hszhao.github.io/)<sup>1+</sup>
<sup>1</sup>The University of Hong Kong · <sup>2</sup>TikTok · <sup>3</sup>Zhejiang Lab · <sup>4</sup>Zhejiang University
<sup>+</sup>corresponding authors
<a href="https://arxiv.org/abs/2401.10891"><img src='https://img.shields.io/badge/arXiv-Depth Anything-red' alt='Paper PDF'></a>
<a href=""><img src='https://img.shields.io/badge/arXiv-Depth Anything-red' alt='Paper PDF'></a>
<a href='https://depth-anything.github.io'><img src='https://img.shields.io/badge/Project_Page-Depth Anything-green' alt='Project Page'></a>
<a href='https://huggingface.co/spaces/LiheYoung/Depth-Anything'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue'></a>
</div>
This work presents Depth Anything, a highly practical solution for robust monocular depth estimation by training on a combination of 1.5M labeled images and **62M+ unlabeled images**.
![teaser](assets/teaser.png)
[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/depth-anything-unleashing-the-power-of-large/monocular-depth-estimation-on-nyu-depth-v2)](https://paperswithcode.com/sota/monocular-depth-estimation-on-nyu-depth-v2?p=depth-anything-unleashing-the-power-of-large)
[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/depth-anything-unleashing-the-power-of-large/monocular-depth-estimation-on-kitti-eigen)](https://paperswithcode.com/sota/monocular-depth-estimation-on-kitti-eigen?p=depth-anything-unleashing-the-power-of-large)
[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/depth-anything-unleashing-the-power-of-large/semantic-segmentation-on-cityscapes-val)](https://paperswithcode.com/sota/semantic-segmentation-on-cityscapes-val?p=depth-anything-unleashing-the-power-of-large)
## News
* **2024-01-22:** Paper, project page, code, models, and demo are released.
@@ -50,7 +46,7 @@ This work presents Depth Anything, a highly practical solution for robust monocu
Here we compare our Depth Anything with the previously best MiDaS v3.1 BEiT<sub>L-512</sub> model.
Please note that the latest MiDaS is also trained on KITTI and NYUv2, while ours is not.
| Method | Params | KITTI || NYUv2 || Sintel || DDAD || ETH3D || DIODE ||
|-|-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|
| | | AbsRel | $\delta_1$ | AbsRel | $\delta_1$ | AbsRel | $\delta_1$ | AbsRel | $\delta_1$ | AbsRel | $\delta_1$ | AbsRel | $\delta_1$ |
@@ -98,12 +94,23 @@ For example:
```bash
python run.py --encoder vitl --load-from checkpoints/depth_anything_vitl14.pth --img-path demo_images --outdir depth_visualization --localhub
```
### Gradio demo
To run our Gradio demo locally:
```bash
python app.py
```
You can also try our [online demo](https://huggingface.co/spaces/LiheYoung/Depth-Anything).
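Before the first launch, make sure the demo's dependencies are installed; this commit pins them in `requirements.txt` (notably `gradio==4.14.0` and `gradio_imageslider`):

```bash
pip install -r requirements.txt
```

By default, Gradio serves the demo locally at `http://localhost:7860`.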
### Import Depth Anything to your project
If you want to use Depth Anything in your own project, you can simply follow [``run.py``](run.py) to load our models and define the data pre-processing.
<details>
<summary>Code snippet (note the difference between our data pre-processing and that of MiDaS)</summary>
```python
from depth_anything.dpt import DPT_DINOv2
from depth_anything.util.transform import Resize, NormalizeImage, PrepareForNet

import cv2
import torch
from torchvision.transforms import Compose

# Same model, weights, and pre-processing as the Gradio demo (app.py).
depth_anything = DPT_DINOv2(encoder='vitl', features=256, out_channels=[256, 512, 1024, 1024])
depth_anything.load_state_dict(torch.load('checkpoints/depth_anything_vitl14.pth', map_location='cpu'))
depth_anything.eval()

transform = Compose([
    Resize(width=518, height=518, resize_target=False, keep_aspect_ratio=True,
           ensure_multiple_of=14, resize_method='lower_bound',
           image_interpolation_method=cv2.INTER_CUBIC),
    NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    PrepareForNet(),
])

image = cv2.cvtColor(cv2.imread('your/image/path'), cv2.COLOR_BGR2RGB) / 255.0
image = transform({'image': image})['image']
image = torch.from_numpy(image).unsqueeze(0)

depth = depth_anything(image)  # relative depth, shape (1, H, W); can be read as disparity
```
</details>
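The model's raw output is un-normalized relative depth. A minimal sketch of turning it into a colorized preview, mirroring the normalization and colormap steps in `app.py` below (variable names are illustrative):

```python
import cv2
import numpy as np

# `depth` is the (1, H, W) tensor returned by the snippet above.
d = depth[0].detach().cpu().numpy()
d = (d - d.min()) / (d.max() - d.min()) * 255.0  # scale to [0, 255]
colored = cv2.applyColorMap(d.astype(np.uint8), cv2.COLORMAP_INFERNO)
cv2.imwrite('depth_colored.png', colored)  # applyColorMap returns BGR, which imwrite expects
```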
## Citation
If you find this project useful, please consider citing:
@@ -146,7 +154,7 @@ If you find this project useful, please consider citing:
@article{depthanything,
title={Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data},
author={Yang, Lihe and Kang, Bingyi and Huang, Zilong and Xu, Xiaogang and Feng, Jiashi and Zhao, Hengshuang},
journal={arXiv:2401.10891},
year={2024}
}
```

app.py (new file)

@@ -0,0 +1,96 @@
import gradio as gr
import cv2
import numpy as np
import os
from PIL import Image
import torch
import torch.nn.functional as F
from torchvision.transforms import Compose
import tempfile
from gradio_imageslider import ImageSlider
from depth_anything.dpt import DPT_DINOv2
from depth_anything.util.transform import Resize, NormalizeImage, PrepareForNet
css = """
#img-display-container {
max-height: 100vh;
}
#img-display-input {
max-height: 80vh;
}
#img-display-output {
max-height: 80vh;
}
"""
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Build the ViT-L DPT model and load the pre-trained weights
# (map_location keeps the load working on CPU-only machines).
model = DPT_DINOv2(encoder='vitl', features=256, out_channels=[256, 512, 1024, 1024]).to(DEVICE).eval()
model.load_state_dict(torch.load('checkpoints/depth_anything_vitl14.pth', map_location='cpu'))
title = "# Depth Anything"
description = """Official demo for **Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data**.
Please refer to our [paper](https://arxiv.org/abs/2401.10891), [project page](https://depth-anything.github.io), or [github](https://github.com/LiheYoung/Depth-Anything) for more details."""
# MiDaS-style pre-processing: resize so that each side is at least 518 px and a
# multiple of 14 (the ViT patch size), keep the aspect ratio, then apply
# ImageNet normalization.
transform = Compose([
    Resize(
        width=518,
        height=518,
        resize_target=False,
        keep_aspect_ratio=True,
        ensure_multiple_of=14,
        resize_method='lower_bound',
        image_interpolation_method=cv2.INTER_CUBIC,
    ),
    NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    PrepareForNet(),
])
# Inference only: no gradients needed.
@torch.no_grad()
def predict_depth(model, image):
    return model(image)
with gr.Blocks(css=css) as demo:
    gr.Markdown(title)
    gr.Markdown(description)
    gr.Markdown("### Depth Prediction demo")
    gr.Markdown("You can slide the output to compare the depth prediction with the input image.")

    with gr.Row():
        input_image = gr.Image(label="Input Image", type='numpy', elem_id='img-display-input')
        depth_image_slider = ImageSlider(label="Depth Map with Slider View", elem_id='img-display-output', position=0)
    raw_file = gr.File(label="16-bit raw depth (can be considered as disparity)")
    submit = gr.Button("Submit")

    def on_submit(image):
        original_image = image.copy()

        h, w = image.shape[:2]

        # Pre-process: scale to [0, 1], apply the MiDaS-style transform, add a batch dim.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) / 255.0
        image = transform({'image': image})['image']
        image = torch.from_numpy(image).unsqueeze(0).to(DEVICE)

        depth = predict_depth(model, image)
        # Resize the prediction back to the input resolution.
        depth = F.interpolate(depth[None], (h, w), mode='bilinear', align_corners=False)[0, 0]

        # Save the un-normalized prediction as a 16-bit PNG for download.
        raw_depth = Image.fromarray(depth.cpu().numpy().astype('uint16'))
        tmp = tempfile.NamedTemporaryFile(suffix='.png', delete=False)
        raw_depth.save(tmp.name)

        # Normalize to [0, 255] and colorize for display; applyColorMap returns BGR,
        # so flip the channel order back to RGB.
        depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
        depth = depth.cpu().numpy().astype(np.uint8)
        colored_depth = cv2.applyColorMap(depth, cv2.COLORMAP_INFERNO)[:, :, ::-1]

        return [(original_image, colored_depth), tmp.name]

    submit.click(on_submit, inputs=[input_image], outputs=[depth_image_slider, raw_file])

    example_files = os.listdir('assets/examples')
    example_files.sort()
    example_files = [os.path.join('assets/examples', filename) for filename in example_files]
    examples = gr.Examples(examples=example_files, inputs=[input_image],
                           outputs=[depth_image_slider, raw_file], fn=on_submit, cache_examples=False)

if __name__ == '__main__':
    demo.queue().launch()
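The downloadable raw file is a 16-bit PNG holding the model's un-normalized output (disparity-like, so larger values are nearer). A quick sketch for loading it back, assuming a file name such as `raw_depth.png`:

```python
import cv2

raw = cv2.imread('raw_depth.png', cv2.IMREAD_UNCHANGED)  # uint16 array at input resolution
```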

requirements.txt

@@ -1,3 +1,5 @@
gradio_imageslider
gradio==4.14.0
torch
torchvision
opencv-python