From cc507c488cd4902d95bc7d2bd248c95e83e39588 Mon Sep 17 00:00:00 2001
From: Lihe Yang
Date: Mon, 22 Jan 2024 14:20:44 +0800
Subject: [PATCH] Add local demo

---
 README.md        | 30 +++++++++------
 app.py           | 96 ++++++++++++++++++++++++++++++++++++++++++++++++
 requirements.txt |  2 +
 3 files changed, 117 insertions(+), 11 deletions(-)
 create mode 100644 app.py

diff --git a/README.md b/README.md
index 8f46d61..6297b28 100644
--- a/README.md
+++ b/README.md
@@ -1,26 +1,22 @@

Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data

-[**Lihe Yang**](https://liheyoung.github.io/)1 · [**Bingyi Kang**](https://scholar.google.com/citations?user=NmHgX-wAAAAJ)2+ · [**Zilong Huang**](http://speedinghzl.github.io/)2 · [**Xiaogang Xu**](https://xiaogang00.github.io/)3,4 · [**Jiashi Feng**](https://sites.google.com/site/jshfeng/)2 · [**Hengshuang Zhao**](https://hszhao.github.io/)1+
+[**Lihe Yang**](https://liheyoung.github.io/)1 · [**Bingyi Kang**](https://scholar.google.com/citations?user=NmHgX-wAAAAJ)2+ · [**Zilong Huang**](http://speedinghzl.github.io/)2 · [**Xiaogang Xu**](https://xiaogang00.github.io/)3,4 · [**Jiashi Feng**](https://sites.google.com/site/jshfeng/)2 · [**Hengshuang Zhao**](https://hszhao.github.io/)1+

1The University of Hong Kong · 2TikTok · 3Zhejiang Lab · 4Zhejiang University

+corresponding authors

-Paper PDF
+Paper PDF Project Page
-
+
This work presents Depth Anything, a highly practical solution for robust monocular depth estimation by training on a combination of 1.5M labeled images and **62M+ unlabeled images**.

![teaser](assets/teaser.png)

-[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/depth-anything-unleashing-the-power-of-large/monocular-depth-estimation-on-nyu-depth-v2)](https://paperswithcode.com/sota/monocular-depth-estimation-on-nyu-depth-v2?p=depth-anything-unleashing-the-power-of-large)
-[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/depth-anything-unleashing-the-power-of-large/monocular-depth-estimation-on-kitti-eigen)](https://paperswithcode.com/sota/monocular-depth-estimation-on-kitti-eigen?p=depth-anything-unleashing-the-power-of-large)
-[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/depth-anything-unleashing-the-power-of-large/semantic-segmentation-on-cityscapes-val)](https://paperswithcode.com/sota/semantic-segmentation-on-cityscapes-val?p=depth-anything-unleashing-the-power-of-large)
-
## News

* **2024-01-22:** Paper, project page, code, models, and demo are released.
@@ -50,7 +46,7 @@ This work presents Depth Anything, a highly practical solution for robust monocu
Here we compare our Depth Anything with the previously best MiDaS v3.1 BEiTL-512 model.

-Please note that the latest MiDaS is also trained on KITTI and NYUv2, while we are not.
+Please note that the latest MiDaS is also trained on KITTI and NYUv2, while ours is not.

| Method | Params | KITTI || NYUv2 || Sintel || DDAD || ETH3D || DIODE ||
|-|-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|
@@ -98,12 +94,23 @@ For example:
python run.py --encoder vitl --load-from checkpoints/depth_anything_vitl14.pth --img-path demo_images --outdir depth_visualization --localhub
```
+
+### Gradio demo
+
+To use our Gradio demo locally:
+
+```bash
+python app.py
+```
+
+You can also try our [online demo](https://huggingface.co/spaces/LiheYoung/Depth-Anything).
+
### Import Depth Anything to your project

If you want to use Depth Anything in your own project, you can simply follow [``run.py``](run.py) to load our models and define data pre-processing.
-Code snippet (note the difference between our data pre-processing and that of MiDaS)
+Code snippet (note the difference between our data pre-processing and that of MiDaS).

```python
from depth_anything.dpt import DPT_DINOv2
@@ -138,6 +145,7 @@ depth = depth_anything(image)
```
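For reference, the pre-processing mentioned above can also be exercised on its own. The following is a minimal sketch that builds the same transform the new `app.py` constructs; the input path `demo_images/example.jpg` is only a placeholder:

```python
import cv2
import torch
from torchvision.transforms import Compose

from depth_anything.util.transform import Resize, NormalizeImage, PrepareForNet

# Scale pixels to [0, 1], resize so the shorter side becomes 518 while keeping the
# aspect ratio, round both sides to multiples of 14 (the DINOv2 patch size), then
# apply ImageNet normalization -- the same Compose that app.py defines.
transform = Compose([
    Resize(
        width=518,
        height=518,
        resize_target=False,
        keep_aspect_ratio=True,
        ensure_multiple_of=14,
        resize_method='lower_bound',
        image_interpolation_method=cv2.INTER_CUBIC,
    ),
    NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    PrepareForNet(),
])

image = cv2.imread('demo_images/example.jpg')             # placeholder path
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) / 255.0    # HWC float array in [0, 1]
image = transform({'image': image})['image']              # CHW numpy array, sides divisible by 14
image = torch.from_numpy(image).unsqueeze(0)              # add batch dimension -> (1, 3, H', W')
```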
+
## Citation

If you find this project useful, please consider citing:
@@ -146,7 +154,7 @@ If you find this project useful, please consider citing:
@article{depthanything,
  title={Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data},
  author={Yang, Lihe and Kang, Bingyi and Huang, Zilong and Xu, Xiaogang and Feng, Jiashi and Zhao, Hengshuang},
-  journal={arXiv:2401.10891},
-  year={2024},
+  journal={arXiv:2401.10891},
+  year={2024}
}
```
\ No newline at end of file
diff --git a/app.py b/app.py
new file mode 100644
index 0000000..11c708f
--- /dev/null
+++ b/app.py
@@ -0,0 +1,96 @@
+import gradio as gr
+import cv2
+import numpy as np
+import os
+from PIL import Image
+import torch
+import torch.nn.functional as F
+from torchvision.transforms import Compose
+import tempfile
+from gradio_imageslider import ImageSlider
+
+from depth_anything.dpt import DPT_DINOv2
+from depth_anything.util.transform import Resize, NormalizeImage, PrepareForNet
+
+css = """
+#img-display-container {
+    max-height: 100vh;
+    }
+#img-display-input {
+    max-height: 80vh;
+    }
+#img-display-output {
+    max-height: 80vh;
+    }
+"""
+DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
+model = DPT_DINOv2(encoder='vitl', features=256, out_channels=[256, 512, 1024, 1024]).to(DEVICE).eval()
+model.load_state_dict(torch.load('checkpoints/depth_anything_vitl14.pth'))
+
+title = "# Depth Anything"
+description = """Official demo for **Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data**.
+Please refer to our [paper](https://arxiv.org/abs/2401.10891), [project page](https://depth-anything.github.io), or [github](https://github.com/LiheYoung/Depth-Anything) for more details."""
+
+transform = Compose([
+    Resize(
+        width=518,
+        height=518,
+        resize_target=False,
+        keep_aspect_ratio=True,
+        ensure_multiple_of=14,
+        resize_method='lower_bound',
+        image_interpolation_method=cv2.INTER_CUBIC,
+    ),
+    NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+    PrepareForNet(),
+])
+
+@torch.no_grad()
+def predict_depth(model, image):
+    return model(image)
+
+
+with gr.Blocks(css=css) as demo:
+    gr.Markdown(title)
+    gr.Markdown(description)
+    gr.Markdown("### Depth Prediction demo")
+    gr.Markdown("You can slide the output to compare the depth prediction with input image")
+
+    with gr.Row():
+        input_image = gr.Image(label="Input Image", type='numpy', elem_id='img-display-input')
+        depth_image_slider = ImageSlider(label="Depth Map with Slider View", elem_id='img-display-output', position=0)
+    raw_file = gr.File(label="16-bit raw depth (can be considered as disparity)")
+    submit = gr.Button("Submit")
+
+    def on_submit(image):
+        original_image = image.copy()
+
+        h, w = image.shape[:2]
+
+        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) / 255.0
+        image = transform({'image': image})['image']
+        image = torch.from_numpy(image).unsqueeze(0).to(DEVICE)
+
+        depth = predict_depth(model, image)
+        depth = F.interpolate(depth[None], (h, w), mode='bilinear', align_corners=False)[0, 0]
+
+        raw_depth = Image.fromarray(depth.cpu().numpy().astype('uint16'))
+        tmp = tempfile.NamedTemporaryFile(suffix='.png', delete=False)
+        raw_depth.save(tmp.name)
+
+        depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
+        depth = depth.cpu().numpy().astype(np.uint8)
+        colored_depth = cv2.applyColorMap(depth, cv2.COLORMAP_INFERNO)[:, :, ::-1]
+
+        return [(original_image, colored_depth), tmp.name]
+
+    submit.click(on_submit, inputs=[input_image], outputs=[depth_image_slider, raw_file])
+
+    example_files = os.listdir('assets/examples')
+    example_files.sort()
+    example_files = [os.path.join('assets/examples', filename) for filename in example_files]
+    examples = gr.Examples(examples=example_files, inputs=[input_image], outputs=[depth_image_slider, raw_file], fn=on_submit, cache_examples=False)
+
+
+if __name__ == '__main__':
+    demo.queue().launch()
diff --git a/requirements.txt b/requirements.txt
index 4f8641a..0d750ee 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,5 @@
+gradio_imageslider
+gradio==4.14.0
torch
torchvision
opencv-python
\ No newline at end of file
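Outside the Gradio UI, the same inference path can be driven from a plain script. The sketch below roughly mirrors `on_submit()` in `app.py` without the interface plumbing; `model` and `transform` are assumed to be built exactly as at the top of `app.py`, and the input path and output file names are placeholders:

```python
import cv2
import numpy as np
import torch
import torch.nn.functional as F
from PIL import Image

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'  # same device selection as app.py

def save_depth(image_path, model, transform, out_prefix='depth'):
    """Run one image through the model and save raw 16-bit and colored depth maps."""
    raw = cv2.imread(image_path)                          # placeholder input path
    h, w = raw.shape[:2]

    image = cv2.cvtColor(raw, cv2.COLOR_BGR2RGB) / 255.0
    image = transform({'image': image})['image']
    image = torch.from_numpy(image).unsqueeze(0).to(DEVICE)

    with torch.no_grad():
        depth = model(image)
    depth = F.interpolate(depth[None], (h, w), mode='bilinear', align_corners=False)[0, 0]

    # 16-bit raw prediction (relative inverse depth / disparity), as served by the demo.
    Image.fromarray(depth.cpu().numpy().astype('uint16')).save(f'{out_prefix}_raw.png')

    # 8-bit colored visualization using the same INFERNO colormap as the demo.
    depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
    colored = cv2.applyColorMap(depth.cpu().numpy().astype(np.uint8), cv2.COLORMAP_INFERNO)
    cv2.imwrite(f'{out_prefix}_colored.png', colored)
```

As in the demo, the 16-bit PNG stores the raw relative prediction, while the 8-bit colormap is only a visualization.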