initial commit
extensions-builtin/ScuNET/preload.py (Executable file, 6 lines added)
@@ -0,0 +1,6 @@
import os

from modules import paths


def preload(parser):
    parser.add_argument("--scunet-models-path", type=str, help="Path to directory with ScuNET model file(s).", default=os.path.join(paths.models_path, 'ScuNET'))
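Editor's note: a self-contained sketch of how such a preload hook is exercised, using a local stand-in for the function above and a bare argparse parser (the real webui feeds each extension's preload() its own shared parser; the default path here is a hypothetical stand-in for paths.models_path):

# Self-contained illustration, not part of the commit.
import argparse
import os

def preload(parser):  # stand-in mirroring the extension's hook above
    parser.add_argument("--scunet-models-path", type=str,
                        help="Path to directory with ScuNET model file(s).",
                        default=os.path.join("models", "ScuNET"))  # stand-in for paths.models_path

parser = argparse.ArgumentParser()
preload(parser)
args = parser.parse_args(["--scunet-models-path", "/tmp/ScuNET"])
print(args.scunet_models_path)  # -> /tmp/ScuNET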
extensions-builtin/ScuNET/scripts/scunet_model.py (Executable file, 74 lines added)
@@ -0,0 +1,74 @@
import sys

import PIL.Image

import modules.upscaler
from modules import devices, errors, modelloader, script_callbacks, shared, upscaler_utils


class UpscalerScuNET(modules.upscaler.Upscaler):
    def __init__(self, dirname):
        self.name = "ScuNET"
        self.model_name = "ScuNET GAN"
        self.model_name2 = "ScuNET PSNR"
        self.model_url = "https://github.com/cszn/KAIR/releases/download/v1.0/scunet_color_real_gan.pth"
        self.model_url2 = "https://github.com/cszn/KAIR/releases/download/v1.0/scunet_color_real_psnr.pth"
        self.user_path = dirname
        super().__init__()
        model_paths = self.find_models(ext_filter=[".pth"])
        scalers = []
        add_model2 = True
        for file in model_paths:
            if file.startswith("http"):
                name = self.model_name
            else:
                name = modelloader.friendly_name(file)
            if name == self.model_name2 or file == self.model_url2:
                add_model2 = False
            try:
                scaler_data = modules.upscaler.UpscalerData(name, file, self, 4)
                scalers.append(scaler_data)
            except Exception:
                errors.report(f"Error loading ScuNET model: {file}", exc_info=True)
        if add_model2:
            scaler_data2 = modules.upscaler.UpscalerData(self.model_name2, self.model_url2, self)
            scalers.append(scaler_data2)
        self.scalers = scalers

    def do_upscale(self, img: PIL.Image.Image, selected_file):
        devices.torch_gc()
        try:
            model = self.load_model(selected_file)
        except Exception as e:
            print(f"ScuNET: Unable to load model from {selected_file}: {e}", file=sys.stderr)
            return img

        img = upscaler_utils.upscale_2(
            img,
            model,
            tile_size=shared.opts.SCUNET_tile,
            tile_overlap=shared.opts.SCUNET_tile_overlap,
            scale=1,  # ScuNET is a denoising model, not an upscaler
            desc='ScuNET',
        )
        devices.torch_gc()
        return img

    def load_model(self, path: str):
        device = devices.get_device_for('scunet')
        if path.startswith("http"):
            # TODO: this doesn't use `path` at all?
            filename = modelloader.load_file_from_url(self.model_url, model_dir=self.model_download_path, file_name=f"{self.name}.pth")
        else:
            filename = path
        return modelloader.load_spandrel_model(filename, device=device, expected_architecture='SCUNet')


def on_ui_settings():
    import gradio as gr

    shared.opts.add_option("SCUNET_tile", shared.OptionInfo(256, "Tile size for SCUNET upscalers.", gr.Slider, {"minimum": 0, "maximum": 512, "step": 16}, section=('upscaling', "Upscaling")).info("0 = no tiling"))
    shared.opts.add_option("SCUNET_tile_overlap", shared.OptionInfo(8, "Tile overlap for SCUNET upscalers.", gr.Slider, {"minimum": 0, "maximum": 64, "step": 1}, section=('upscaling', "Upscaling")).info("Low values = visible seam"))


script_callbacks.on_ui_settings(on_ui_settings)
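Editor's note: the SCUNET_tile / SCUNET_tile_overlap options registered above are passed to upscaler_utils.upscale_2, which runs the model over overlapping tiles. Purely as an illustration of the tiling arithmetic (not the actual upscale_2 implementation), the start offsets of overlapping tiles along one axis could be computed like this:

def tile_origins(length: int, tile_size: int, overlap: int) -> list[int]:
    """Illustrative only: start offsets of overlapping tiles covering `length` pixels."""
    if tile_size <= 0 or tile_size >= length:
        return [0]  # tiling disabled (0 = no tiling) or image fits in a single tile
    stride = tile_size - overlap
    origins = list(range(0, length - tile_size, stride))
    origins.append(length - tile_size)  # final tile sits flush with the edge
    return origins

# e.g. a 600-pixel edge with 256-pixel tiles and 8-pixel overlap:
print(tile_origins(600, 256, 8))  # [0, 248, 344]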
extensions-builtin/SwinIR/preload.py (Executable file, 6 lines added)
@@ -0,0 +1,6 @@
import os

from modules import paths


def preload(parser):
    parser.add_argument("--swinir-models-path", type=str, help="Path to directory with SwinIR model file(s).", default=os.path.join(paths.models_path, 'SwinIR'))
extensions-builtin/SwinIR/scripts/swinir_model.py (Executable file, 98 lines added)
@@ -0,0 +1,98 @@
import logging
import sys

import torch
from PIL import Image

from modules import devices, modelloader, script_callbacks, shared, upscaler_utils
from modules.upscaler import Upscaler, UpscalerData
from modules_forge.utils import prepare_free_memory

SWINIR_MODEL_URL = "https://github.com/JingyunLiang/SwinIR/releases/download/v0.0/003_realSR_BSRGAN_DFOWMFC_s64w8_SwinIR-L_x4_GAN.pth"

logger = logging.getLogger(__name__)


class UpscalerSwinIR(Upscaler):
    def __init__(self, dirname):
        self._cached_model = None  # keep the model when SWIN_torch_compile is on, to prevent re-compiling on every run
        self._cached_model_config = None  # to clear '_cached_model' when changing model (v1/v2) or settings
        self.name = "SwinIR"
        self.model_url = SWINIR_MODEL_URL
        self.model_name = "SwinIR 4x"
        self.user_path = dirname
        super().__init__()
        scalers = []
        model_files = self.find_models(ext_filter=[".pt", ".pth"])
        for model in model_files:
            if model.startswith("http"):
                name = self.model_name
            else:
                name = modelloader.friendly_name(model)
            model_data = UpscalerData(name, model, self)
            scalers.append(model_data)
        self.scalers = scalers

    def do_upscale(self, img: Image.Image, model_file: str) -> Image.Image:
        prepare_free_memory()

        current_config = (model_file, shared.opts.SWIN_tile)

        if self._cached_model_config == current_config:
            model = self._cached_model
        else:
            try:
                model = self.load_model(model_file)
            except Exception as e:
                print(f"Failed loading SwinIR model {model_file}: {e}", file=sys.stderr)
                return img
            self._cached_model = model
            self._cached_model_config = current_config

        img = upscaler_utils.upscale_2(
            img,
            model,
            tile_size=shared.opts.SWIN_tile,
            tile_overlap=shared.opts.SWIN_tile_overlap,
            scale=model.scale,
            desc="SwinIR",
        )
        devices.torch_gc()
        return img

    def load_model(self, path, scale=4):
        if path.startswith("http"):
            filename = modelloader.load_file_from_url(
                url=path,
                model_dir=self.model_download_path,
                file_name=f"{self.model_name.replace(' ', '_')}.pth",
            )
        else:
            filename = path

        model_descriptor = modelloader.load_spandrel_model(
            filename,
            device=self._get_device(),
            prefer_half=(devices.dtype == torch.float16),
            expected_architecture="SwinIR",
        )
        if getattr(shared.opts, 'SWIN_torch_compile', False):
            try:
                model_descriptor.model.compile()
            except Exception:
                logger.warning("Failed to compile SwinIR model, fallback to JIT", exc_info=True)
        return model_descriptor

    def _get_device(self):
        return devices.get_device_for('swinir')


def on_ui_settings():
    import gradio as gr

    shared.opts.add_option("SWIN_tile", shared.OptionInfo(192, "Tile size for all SwinIR.", gr.Slider, {"minimum": 16, "maximum": 512, "step": 16}, section=('upscaling', "Upscaling")))
    shared.opts.add_option("SWIN_tile_overlap", shared.OptionInfo(8, "Tile overlap, in pixels for SwinIR. Low values = visible seam.", gr.Slider, {"minimum": 0, "maximum": 48, "step": 1}, section=('upscaling', "Upscaling")))
    shared.opts.add_option("SWIN_torch_compile", shared.OptionInfo(False, "Use torch.compile to accelerate SwinIR.", gr.Checkbox, {"interactive": True}, section=('upscaling', "Upscaling")).info("Takes longer on first run"))


script_callbacks.on_ui_settings(on_ui_settings)
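Editor's note: the `_cached_model` / `_cached_model_config` pair above is a config-keyed cache: the model is reloaded only when the model file or the tile setting changes, which matters when torch.compile is enabled. A stripped-down, hypothetical sketch of the same pattern outside the webui:

class CachedLoader:
    """Illustrative sketch of the config-keyed cache used by UpscalerSwinIR above."""
    def __init__(self, load_fn):
        self._load_fn = load_fn
        self._cached = None
        self._cached_key = None

    def get(self, path: str, tile: int):
        key = (path, tile)
        if self._cached_key != key:  # model file or settings changed -> reload
            self._cached = self._load_fn(path)
            self._cached_key = key
        return self._cached

loader = CachedLoader(load_fn=lambda p: f"model<{p}>")  # stand-in for load_model()
assert loader.get("a.pth", 192) is loader.get("a.pth", 192)  # second call hits the cache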
extensions-builtin/extra-options-section/scripts/extra_options_section.py (Executable file, 87 lines added)
@@ -0,0 +1,87 @@
import math

import gradio as gr
from modules import scripts, shared, ui_components, ui_settings, infotext_utils, errors
from modules.ui_components import FormColumn


class ExtraOptionsSection(scripts.Script):
    section = "extra_options"

    def __init__(self):
        self.comps = None
        self.setting_names = None

    def title(self):
        return "Extra options"

    def show(self, is_img2img):
        return scripts.AlwaysVisible

    def ui(self, is_img2img):
        self.comps = []
        self.setting_names = []
        self.infotext_fields = []
        extra_options = shared.opts.extra_options_img2img if is_img2img else shared.opts.extra_options_txt2img
        elem_id_tabname = "extra_options_" + ("img2img" if is_img2img else "txt2img")

        not_allowed = ['sd_model_checkpoint', 'sd_vae', 'CLIP_stop_at_last_layers', 'forge_additional_modules']
        for na in not_allowed:
            if na in extra_options:
                extra_options.remove(na)

        mapping = {k: v for v, k in infotext_utils.infotext_to_setting_name_mapping}

        with gr.Blocks() as interface:
            with gr.Accordion("Options", open=False, elem_id=elem_id_tabname) if shared.opts.extra_options_accordion and extra_options else gr.Group(elem_id=elem_id_tabname):

                row_count = math.ceil(len(extra_options) / shared.opts.extra_options_cols)

                for row in range(row_count):
                    with gr.Row():
                        for col in range(shared.opts.extra_options_cols):
                            index = row * shared.opts.extra_options_cols + col
                            if index >= len(extra_options):
                                break

                            setting_name = extra_options[index]

                            with FormColumn():
                                try:
                                    comp = ui_settings.create_setting_component(setting_name)
                                except KeyError:
                                    errors.report(f"Can't add extra options for {setting_name} in ui")
                                    continue

                            self.comps.append(comp)
                            self.setting_names.append(setting_name)

                            setting_infotext_name = mapping.get(setting_name)
                            if setting_infotext_name is not None:
                                self.infotext_fields.append((comp, setting_infotext_name))

        def get_settings_values():
            res = [ui_settings.get_value_for_setting(key) for key in self.setting_names]
            return res[0] if len(res) == 1 else res

        interface.load(fn=get_settings_values, inputs=[], outputs=self.comps, queue=False, show_progress=False)

        return self.comps

    def before_process(self, p, *args):
        for name, value in zip(self.setting_names, args):
            if name not in p.override_settings:
                p.override_settings[name] = value


shared.options_templates.update(shared.options_section(('settings_in_ui', "Settings in UI", "ui"), {
    "settings_in_ui": shared.OptionHTML("""
This page allows you to add some settings to the main interface of txt2img and img2img tabs.
"""),
    "extra_options_txt2img": shared.OptionInfo([], "Settings for txt2img", ui_components.DropdownMulti, lambda: {"choices": list(shared.opts.data_labels.keys())}).js("info", "settingsHintsShowQuicksettings").info("setting entries that also appear in txt2img interfaces").needs_reload_ui(),
    "extra_options_img2img": shared.OptionInfo([], "Settings for img2img", ui_components.DropdownMulti, lambda: {"choices": list(shared.opts.data_labels.keys())}).js("info", "settingsHintsShowQuicksettings").info("setting entries that also appear in img2img interfaces").needs_reload_ui(),
    "extra_options_cols": shared.OptionInfo(1, "Number of columns for added settings", gr.Slider, {"step": 1, "minimum": 1, "maximum": 20}).info("displayed amount will depend on the actual browser window width").needs_reload_ui(),
    "extra_options_accordion": shared.OptionInfo(False, "Place added settings into an accordion").needs_reload_ui()
}))
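Editor's note: the row/column arithmetic in ui() above (math.ceil plus row * cols + col indexing) simply chunks the flat list of selected setting names into rows of N columns. A small self-contained illustration, assuming five hypothetical setting names and two columns:

import math

settings = ["opt_a", "opt_b", "opt_c", "opt_d", "opt_e"]  # hypothetical setting names
cols = 2
rows = math.ceil(len(settings) / cols)  # -> 3 rows

grid = []
for row in range(rows):
    current = []
    for col in range(cols):
        index = row * cols + col
        if index >= len(settings):
            break  # last row may be short
        current.append(settings[index])
    grid.append(current)

print(grid)  # [['opt_a', 'opt_b'], ['opt_c', 'opt_d'], ['opt_e']]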
extensions-builtin/forge_legacy_preprocessors/.gitignore (vendored, Executable file, 185 lines added)
@@ -0,0 +1,185 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea
*.pt
*.pth
*.ckpt
*.bin
*.safetensors

# Editor setting metadata
.idea/
.vscode/
detected_maps/
annotator/downloads/

# test results and expectations
web_tests/results/
web_tests/expectations/
tests/web_api/full_coverage/results/
tests/web_api/full_coverage/expectations/

*_diff.png

# Presets
presets/

# Ignore existing dir of hand refiner if exists.
annotator/hand_refiner_portable
extensions-builtin/forge_legacy_preprocessors/LICENSE (Executable file, 674 lines added)
@@ -0,0 +1,674 @@
|
||||
GNU GENERAL PUBLIC LICENSE
|
||||
Version 3, 29 June 2007
|
||||
|
||||
Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
|
||||
Everyone is permitted to copy and distribute verbatim copies
|
||||
of this license document, but changing it is not allowed.
|
||||
|
||||
Preamble
|
||||
|
||||
The GNU General Public License is a free, copyleft license for
|
||||
software and other kinds of works.
|
||||
|
||||
The licenses for most software and other practical works are designed
|
||||
to take away your freedom to share and change the works. By contrast,
|
||||
the GNU General Public License is intended to guarantee your freedom to
|
||||
share and change all versions of a program--to make sure it remains free
|
||||
software for all its users. We, the Free Software Foundation, use the
|
||||
GNU General Public License for most of our software; it applies also to
|
||||
any other work released this way by its authors. You can apply it to
|
||||
your programs, too.
|
||||
|
||||
When we speak of free software, we are referring to freedom, not
|
||||
price. Our General Public Licenses are designed to make sure that you
|
||||
have the freedom to distribute copies of free software (and charge for
|
||||
them if you wish), that you receive source code or can get it if you
|
||||
want it, that you can change the software or use pieces of it in new
|
||||
free programs, and that you know you can do these things.
|
||||
|
||||
To protect your rights, we need to prevent others from denying you
|
||||
these rights or asking you to surrender the rights. Therefore, you have
|
||||
certain responsibilities if you distribute copies of the software, or if
|
||||
you modify it: responsibilities to respect the freedom of others.
|
||||
|
||||
For example, if you distribute copies of such a program, whether
|
||||
gratis or for a fee, you must pass on to the recipients the same
|
||||
freedoms that you received. You must make sure that they, too, receive
|
||||
or can get the source code. And you must show them these terms so they
|
||||
know their rights.
|
||||
|
||||
Developers that use the GNU GPL protect your rights with two steps:
|
||||
(1) assert copyright on the software, and (2) offer you this License
|
||||
giving you legal permission to copy, distribute and/or modify it.
|
||||
|
||||
For the developers' and authors' protection, the GPL clearly explains
|
||||
that there is no warranty for this free software. For both users' and
|
||||
authors' sake, the GPL requires that modified versions be marked as
|
||||
changed, so that their problems will not be attributed erroneously to
|
||||
authors of previous versions.
|
||||
|
||||
Some devices are designed to deny users access to install or run
|
||||
modified versions of the software inside them, although the manufacturer
|
||||
can do so. This is fundamentally incompatible with the aim of
|
||||
protecting users' freedom to change the software. The systematic
|
||||
pattern of such abuse occurs in the area of products for individuals to
|
||||
use, which is precisely where it is most unacceptable. Therefore, we
|
||||
have designed this version of the GPL to prohibit the practice for those
|
||||
products. If such problems arise substantially in other domains, we
|
||||
stand ready to extend this provision to those domains in future versions
|
||||
of the GPL, as needed to protect the freedom of users.
|
||||
|
||||
Finally, every program is threatened constantly by software patents.
|
||||
States should not allow patents to restrict development and use of
|
||||
software on general-purpose computers, but in those that do, we wish to
|
||||
avoid the special danger that patents applied to a free program could
|
||||
make it effectively proprietary. To prevent this, the GPL assures that
|
||||
patents cannot be used to render the program non-free.
|
||||
|
||||
The precise terms and conditions for copying, distribution and
|
||||
modification follow.
|
||||
|
||||
TERMS AND CONDITIONS
|
||||
|
||||
0. Definitions.
|
||||
|
||||
"This License" refers to version 3 of the GNU General Public License.
|
||||
|
||||
"Copyright" also means copyright-like laws that apply to other kinds of
|
||||
works, such as semiconductor masks.
|
||||
|
||||
"The Program" refers to any copyrightable work licensed under this
|
||||
License. Each licensee is addressed as "you". "Licensees" and
|
||||
"recipients" may be individuals or organizations.
|
||||
|
||||
To "modify" a work means to copy from or adapt all or part of the work
|
||||
in a fashion requiring copyright permission, other than the making of an
|
||||
exact copy. The resulting work is called a "modified version" of the
|
||||
earlier work or a work "based on" the earlier work.
|
||||
|
||||
A "covered work" means either the unmodified Program or a work based
|
||||
on the Program.
|
||||
|
||||
To "propagate" a work means to do anything with it that, without
|
||||
permission, would make you directly or secondarily liable for
|
||||
infringement under applicable copyright law, except executing it on a
|
||||
computer or modifying a private copy. Propagation includes copying,
|
||||
distribution (with or without modification), making available to the
|
||||
public, and in some countries other activities as well.
|
||||
|
||||
To "convey" a work means any kind of propagation that enables other
|
||||
parties to make or receive copies. Mere interaction with a user through
|
||||
a computer network, with no transfer of a copy, is not conveying.
|
||||
|
||||
An interactive user interface displays "Appropriate Legal Notices"
|
||||
to the extent that it includes a convenient and prominently visible
|
||||
feature that (1) displays an appropriate copyright notice, and (2)
|
||||
tells the user that there is no warranty for the work (except to the
|
||||
extent that warranties are provided), that licensees may convey the
|
||||
work under this License, and how to view a copy of this License. If
|
||||
the interface presents a list of user commands or options, such as a
|
||||
menu, a prominent item in the list meets this criterion.
|
||||
|
||||
1. Source Code.
|
||||
|
||||
The "source code" for a work means the preferred form of the work
|
||||
for making modifications to it. "Object code" means any non-source
|
||||
form of a work.
|
||||
|
||||
A "Standard Interface" means an interface that either is an official
|
||||
standard defined by a recognized standards body, or, in the case of
|
||||
interfaces specified for a particular programming language, one that
|
||||
is widely used among developers working in that language.
|
||||
|
||||
The "System Libraries" of an executable work include anything, other
|
||||
than the work as a whole, that (a) is included in the normal form of
|
||||
packaging a Major Component, but which is not part of that Major
|
||||
Component, and (b) serves only to enable use of the work with that
|
||||
Major Component, or to implement a Standard Interface for which an
|
||||
implementation is available to the public in source code form. A
|
||||
"Major Component", in this context, means a major essential component
|
||||
(kernel, window system, and so on) of the specific operating system
|
||||
(if any) on which the executable work runs, or a compiler used to
|
||||
produce the work, or an object code interpreter used to run it.
|
||||
|
||||
The "Corresponding Source" for a work in object code form means all
|
||||
the source code needed to generate, install, and (for an executable
|
||||
work) run the object code and to modify the work, including scripts to
|
||||
control those activities. However, it does not include the work's
|
||||
System Libraries, or general-purpose tools or generally available free
|
||||
programs which are used unmodified in performing those activities but
|
||||
which are not part of the work. For example, Corresponding Source
|
||||
includes interface definition files associated with source files for
|
||||
the work, and the source code for shared libraries and dynamically
|
||||
linked subprograms that the work is specifically designed to require,
|
||||
such as by intimate data communication or control flow between those
|
||||
subprograms and other parts of the work.
|
||||
|
||||
The Corresponding Source need not include anything that users
|
||||
can regenerate automatically from other parts of the Corresponding
|
||||
Source.
|
||||
|
||||
The Corresponding Source for a work in source code form is that
|
||||
same work.
|
||||
|
||||
2. Basic Permissions.
|
||||
|
||||
All rights granted under this License are granted for the term of
|
||||
copyright on the Program, and are irrevocable provided the stated
|
||||
conditions are met. This License explicitly affirms your unlimited
|
||||
permission to run the unmodified Program. The output from running a
|
||||
covered work is covered by this License only if the output, given its
|
||||
content, constitutes a covered work. This License acknowledges your
|
||||
rights of fair use or other equivalent, as provided by copyright law.
|
||||
|
||||
You may make, run and propagate covered works that you do not
|
||||
convey, without conditions so long as your license otherwise remains
|
||||
in force. You may convey covered works to others for the sole purpose
|
||||
of having them make modifications exclusively for you, or provide you
|
||||
with facilities for running those works, provided that you comply with
|
||||
the terms of this License in conveying all material for which you do
|
||||
not control copyright. Those thus making or running the covered works
|
||||
for you must do so exclusively on your behalf, under your direction
|
||||
and control, on terms that prohibit them from making any copies of
|
||||
your copyrighted material outside their relationship with you.
|
||||
|
||||
Conveying under any other circumstances is permitted solely under
|
||||
the conditions stated below. Sublicensing is not allowed; section 10
|
||||
makes it unnecessary.
|
||||
|
||||
3. Protecting Users' Legal Rights From Anti-Circumvention Law.
|
||||
|
||||
No covered work shall be deemed part of an effective technological
|
||||
measure under any applicable law fulfilling obligations under article
|
||||
11 of the WIPO copyright treaty adopted on 20 December 1996, or
|
||||
similar laws prohibiting or restricting circumvention of such
|
||||
measures.
|
||||
|
||||
When you convey a covered work, you waive any legal power to forbid
|
||||
circumvention of technological measures to the extent such circumvention
|
||||
is effected by exercising rights under this License with respect to
|
||||
the covered work, and you disclaim any intention to limit operation or
|
||||
modification of the work as a means of enforcing, against the work's
|
||||
users, your or third parties' legal rights to forbid circumvention of
|
||||
technological measures.
|
||||
|
||||
4. Conveying Verbatim Copies.
|
||||
|
||||
You may convey verbatim copies of the Program's source code as you
|
||||
receive it, in any medium, provided that you conspicuously and
|
||||
appropriately publish on each copy an appropriate copyright notice;
|
||||
keep intact all notices stating that this License and any
|
||||
non-permissive terms added in accord with section 7 apply to the code;
|
||||
keep intact all notices of the absence of any warranty; and give all
|
||||
recipients a copy of this License along with the Program.
|
||||
|
||||
You may charge any price or no price for each copy that you convey,
|
||||
and you may offer support or warranty protection for a fee.
|
||||
|
||||
5. Conveying Modified Source Versions.
|
||||
|
||||
You may convey a work based on the Program, or the modifications to
|
||||
produce it from the Program, in the form of source code under the
|
||||
terms of section 4, provided that you also meet all of these conditions:
|
||||
|
||||
a) The work must carry prominent notices stating that you modified
|
||||
it, and giving a relevant date.
|
||||
|
||||
b) The work must carry prominent notices stating that it is
|
||||
released under this License and any conditions added under section
|
||||
7. This requirement modifies the requirement in section 4 to
|
||||
"keep intact all notices".
|
||||
|
||||
c) You must license the entire work, as a whole, under this
|
||||
License to anyone who comes into possession of a copy. This
|
||||
License will therefore apply, along with any applicable section 7
|
||||
additional terms, to the whole of the work, and all its parts,
|
||||
regardless of how they are packaged. This License gives no
|
||||
permission to license the work in any other way, but it does not
|
||||
invalidate such permission if you have separately received it.
|
||||
|
||||
d) If the work has interactive user interfaces, each must display
|
||||
Appropriate Legal Notices; however, if the Program has interactive
|
||||
interfaces that do not display Appropriate Legal Notices, your
|
||||
work need not make them do so.
|
||||
|
||||
A compilation of a covered work with other separate and independent
|
||||
works, which are not by their nature extensions of the covered work,
|
||||
and which are not combined with it such as to form a larger program,
|
||||
in or on a volume of a storage or distribution medium, is called an
|
||||
"aggregate" if the compilation and its resulting copyright are not
|
||||
used to limit the access or legal rights of the compilation's users
|
||||
beyond what the individual works permit. Inclusion of a covered work
|
||||
in an aggregate does not cause this License to apply to the other
|
||||
parts of the aggregate.
|
||||
|
||||
6. Conveying Non-Source Forms.
|
||||
|
||||
You may convey a covered work in object code form under the terms
|
||||
of sections 4 and 5, provided that you also convey the
|
||||
machine-readable Corresponding Source under the terms of this License,
|
||||
in one of these ways:
|
||||
|
||||
a) Convey the object code in, or embodied in, a physical product
|
||||
(including a physical distribution medium), accompanied by the
|
||||
Corresponding Source fixed on a durable physical medium
|
||||
customarily used for software interchange.
|
||||
|
||||
b) Convey the object code in, or embodied in, a physical product
|
||||
(including a physical distribution medium), accompanied by a
|
||||
written offer, valid for at least three years and valid for as
|
||||
long as you offer spare parts or customer support for that product
|
||||
model, to give anyone who possesses the object code either (1) a
|
||||
copy of the Corresponding Source for all the software in the
|
||||
product that is covered by this License, on a durable physical
|
||||
medium customarily used for software interchange, for a price no
|
||||
more than your reasonable cost of physically performing this
|
||||
conveying of source, or (2) access to copy the
|
||||
Corresponding Source from a network server at no charge.
|
||||
|
||||
c) Convey individual copies of the object code with a copy of the
|
||||
written offer to provide the Corresponding Source. This
|
||||
alternative is allowed only occasionally and noncommercially, and
|
||||
only if you received the object code with such an offer, in accord
|
||||
with subsection 6b.
|
||||
|
||||
d) Convey the object code by offering access from a designated
|
||||
place (gratis or for a charge), and offer equivalent access to the
|
||||
Corresponding Source in the same way through the same place at no
|
||||
further charge. You need not require recipients to copy the
|
||||
Corresponding Source along with the object code. If the place to
|
||||
copy the object code is a network server, the Corresponding Source
|
||||
may be on a different server (operated by you or a third party)
|
||||
that supports equivalent copying facilities, provided you maintain
|
||||
clear directions next to the object code saying where to find the
|
||||
Corresponding Source. Regardless of what server hosts the
|
||||
Corresponding Source, you remain obligated to ensure that it is
|
||||
available for as long as needed to satisfy these requirements.
|
||||
|
||||
e) Convey the object code using peer-to-peer transmission, provided
|
||||
you inform other peers where the object code and Corresponding
|
||||
Source of the work are being offered to the general public at no
|
||||
charge under subsection 6d.
|
||||
|
||||
A separable portion of the object code, whose source code is excluded
|
||||
from the Corresponding Source as a System Library, need not be
|
||||
included in conveying the object code work.
|
||||
|
||||
A "User Product" is either (1) a "consumer product", which means any
|
||||
tangible personal property which is normally used for personal, family,
|
||||
or household purposes, or (2) anything designed or sold for incorporation
|
||||
into a dwelling. In determining whether a product is a consumer product,
|
||||
doubtful cases shall be resolved in favor of coverage. For a particular
|
||||
product received by a particular user, "normally used" refers to a
|
||||
typical or common use of that class of product, regardless of the status
|
||||
of the particular user or of the way in which the particular user
|
||||
actually uses, or expects or is expected to use, the product. A product
|
||||
is a consumer product regardless of whether the product has substantial
|
||||
commercial, industrial or non-consumer uses, unless such uses represent
|
||||
the only significant mode of use of the product.
|
||||
|
||||
"Installation Information" for a User Product means any methods,
|
||||
procedures, authorization keys, or other information required to install
|
||||
and execute modified versions of a covered work in that User Product from
|
||||
a modified version of its Corresponding Source. The information must
|
||||
suffice to ensure that the continued functioning of the modified object
|
||||
code is in no case prevented or interfered with solely because
|
||||
modification has been made.
|
||||
|
||||
If you convey an object code work under this section in, or with, or
|
||||
specifically for use in, a User Product, and the conveying occurs as
|
||||
part of a transaction in which the right of possession and use of the
|
||||
User Product is transferred to the recipient in perpetuity or for a
|
||||
fixed term (regardless of how the transaction is characterized), the
|
||||
Corresponding Source conveyed under this section must be accompanied
|
||||
by the Installation Information. But this requirement does not apply
|
||||
if neither you nor any third party retains the ability to install
|
||||
modified object code on the User Product (for example, the work has
|
||||
been installed in ROM).
|
||||
|
||||
The requirement to provide Installation Information does not include a
|
||||
requirement to continue to provide support service, warranty, or updates
|
||||
for a work that has been modified or installed by the recipient, or for
|
||||
the User Product in which it has been modified or installed. Access to a
|
||||
network may be denied when the modification itself materially and
|
||||
adversely affects the operation of the network or violates the rules and
|
||||
protocols for communication across the network.
|
||||
|
||||
Corresponding Source conveyed, and Installation Information provided,
|
||||
in accord with this section must be in a format that is publicly
|
||||
documented (and with an implementation available to the public in
|
||||
source code form), and must require no special password or key for
|
||||
unpacking, reading or copying.
|
||||
|
||||
7. Additional Terms.
|
||||
|
||||
"Additional permissions" are terms that supplement the terms of this
|
||||
License by making exceptions from one or more of its conditions.
|
||||
Additional permissions that are applicable to the entire Program shall
|
||||
be treated as though they were included in this License, to the extent
|
||||
that they are valid under applicable law. If additional permissions
|
||||
apply only to part of the Program, that part may be used separately
|
||||
under those permissions, but the entire Program remains governed by
|
||||
this License without regard to the additional permissions.
|
||||
|
||||
When you convey a copy of a covered work, you may at your option
|
||||
remove any additional permissions from that copy, or from any part of
|
||||
it. (Additional permissions may be written to require their own
|
||||
removal in certain cases when you modify the work.) You may place
|
||||
additional permissions on material, added by you to a covered work,
|
||||
for which you have or can give appropriate copyright permission.
|
||||
|
||||
Notwithstanding any other provision of this License, for material you
|
||||
add to a covered work, you may (if authorized by the copyright holders of
|
||||
that material) supplement the terms of this License with terms:
|
||||
|
||||
a) Disclaiming warranty or limiting liability differently from the
|
||||
terms of sections 15 and 16 of this License; or
|
||||
|
||||
b) Requiring preservation of specified reasonable legal notices or
|
||||
author attributions in that material or in the Appropriate Legal
|
||||
Notices displayed by works containing it; or
|
||||
|
||||
c) Prohibiting misrepresentation of the origin of that material, or
|
||||
requiring that modified versions of such material be marked in
|
||||
reasonable ways as different from the original version; or
|
||||
|
||||
d) Limiting the use for publicity purposes of names of licensors or
|
||||
authors of the material; or
|
||||
|
||||
e) Declining to grant rights under trademark law for use of some
|
||||
trade names, trademarks, or service marks; or
|
||||
|
||||
f) Requiring indemnification of licensors and authors of that
|
||||
material by anyone who conveys the material (or modified versions of
|
||||
it) with contractual assumptions of liability to the recipient, for
|
||||
any liability that these contractual assumptions directly impose on
|
||||
those licensors and authors.
|
||||
|
||||
All other non-permissive additional terms are considered "further
|
||||
restrictions" within the meaning of section 10. If the Program as you
|
||||
received it, or any part of it, contains a notice stating that it is
|
||||
governed by this License along with a term that is a further
|
||||
restriction, you may remove that term. If a license document contains
|
||||
a further restriction but permits relicensing or conveying under this
|
||||
License, you may add to a covered work material governed by the terms
|
||||
of that license document, provided that the further restriction does
|
||||
not survive such relicensing or conveying.
|
||||
|
||||
If you add terms to a covered work in accord with this section, you
|
||||
must place, in the relevant source files, a statement of the
|
||||
additional terms that apply to those files, or a notice indicating
|
||||
where to find the applicable terms.
|
||||
|
||||
Additional terms, permissive or non-permissive, may be stated in the
|
||||
form of a separately written license, or stated as exceptions;
|
||||
the above requirements apply either way.
|
||||
|
||||
8. Termination.
|
||||
|
||||
You may not propagate or modify a covered work except as expressly
|
||||
provided under this License. Any attempt otherwise to propagate or
|
||||
modify it is void, and will automatically terminate your rights under
|
||||
this License (including any patent licenses granted under the third
|
||||
paragraph of section 11).
|
||||
|
||||
However, if you cease all violation of this License, then your
|
||||
license from a particular copyright holder is reinstated (a)
|
||||
provisionally, unless and until the copyright holder explicitly and
|
||||
finally terminates your license, and (b) permanently, if the copyright
|
||||
holder fails to notify you of the violation by some reasonable means
|
||||
prior to 60 days after the cessation.
|
||||
|
||||
Moreover, your license from a particular copyright holder is
|
||||
reinstated permanently if the copyright holder notifies you of the
|
||||
violation by some reasonable means, this is the first time you have
|
||||
received notice of violation of this License (for any work) from that
|
||||
copyright holder, and you cure the violation prior to 30 days after
|
||||
your receipt of the notice.
|
||||
|
||||
Termination of your rights under this section does not terminate the
|
||||
licenses of parties who have received copies or rights from you under
|
||||
this License. If your rights have been terminated and not permanently
|
||||
reinstated, you do not qualify to receive new licenses for the same
|
||||
material under section 10.
|
||||
|
||||
9. Acceptance Not Required for Having Copies.
|
||||
|
||||
You are not required to accept this License in order to receive or
|
||||
run a copy of the Program. Ancillary propagation of a covered work
|
||||
occurring solely as a consequence of using peer-to-peer transmission
|
||||
to receive a copy likewise does not require acceptance. However,
|
||||
nothing other than this License grants you permission to propagate or
|
||||
modify any covered work. These actions infringe copyright if you do
|
||||
not accept this License. Therefore, by modifying or propagating a
|
||||
covered work, you indicate your acceptance of this License to do so.
|
||||
|
||||
10. Automatic Licensing of Downstream Recipients.
|
||||
|
||||
Each time you convey a covered work, the recipient automatically
|
||||
receives a license from the original licensors, to run, modify and
|
||||
propagate that work, subject to this License. You are not responsible
|
||||
for enforcing compliance by third parties with this License.
|
||||
|
||||
An "entity transaction" is a transaction transferring control of an
|
||||
organization, or substantially all assets of one, or subdividing an
|
||||
organization, or merging organizations. If propagation of a covered
|
||||
work results from an entity transaction, each party to that
|
||||
transaction who receives a copy of the work also receives whatever
|
||||
licenses to the work the party's predecessor in interest had or could
|
||||
give under the previous paragraph, plus a right to possession of the
|
||||
Corresponding Source of the work from the predecessor in interest, if
|
||||
the predecessor has it or can get it with reasonable efforts.
|
||||
|
||||
You may not impose any further restrictions on the exercise of the
|
||||
rights granted or affirmed under this License. For example, you may
|
||||
not impose a license fee, royalty, or other charge for exercise of
|
||||
rights granted under this License, and you may not initiate litigation
|
||||
(including a cross-claim or counterclaim in a lawsuit) alleging that
|
||||
any patent claim is infringed by making, using, selling, offering for
|
||||
sale, or importing the Program or any portion of it.
|
||||
|
||||
11. Patents.
|
||||
|
||||
A "contributor" is a copyright holder who authorizes use under this
|
||||
License of the Program or a work on which the Program is based. The
|
||||
work thus licensed is called the contributor's "contributor version".
|
||||
|
||||
A contributor's "essential patent claims" are all patent claims
|
||||
owned or controlled by the contributor, whether already acquired or
|
||||
hereafter acquired, that would be infringed by some manner, permitted
|
||||
by this License, of making, using, or selling its contributor version,
|
||||
but do not include claims that would be infringed only as a
|
||||
consequence of further modification of the contributor version. For
|
||||
purposes of this definition, "control" includes the right to grant
|
||||
patent sublicenses in a manner consistent with the requirements of
|
||||
this License.
|
||||
|
||||
Each contributor grants you a non-exclusive, worldwide, royalty-free
|
||||
patent license under the contributor's essential patent claims, to
|
||||
make, use, sell, offer for sale, import and otherwise run, modify and
|
||||
propagate the contents of its contributor version.
|
||||
|
||||
In the following three paragraphs, a "patent license" is any express
|
||||
agreement or commitment, however denominated, not to enforce a patent
|
||||
(such as an express permission to practice a patent or covenant not to
|
||||
sue for patent infringement). To "grant" such a patent license to a
|
||||
party means to make such an agreement or commitment not to enforce a
|
||||
patent against the party.
|
||||
|
||||
If you convey a covered work, knowingly relying on a patent license,
|
||||
and the Corresponding Source of the work is not available for anyone
|
||||
to copy, free of charge and under the terms of this License, through a
|
||||
publicly available network server or other readily accessible means,
|
||||
then you must either (1) cause the Corresponding Source to be so
|
||||
available, or (2) arrange to deprive yourself of the benefit of the
|
||||
patent license for this particular work, or (3) arrange, in a manner
|
||||
consistent with the requirements of this License, to extend the patent
|
||||
license to downstream recipients. "Knowingly relying" means you have
|
||||
actual knowledge that, but for the patent license, your conveying the
|
||||
covered work in a country, or your recipient's use of the covered work
|
||||
in a country, would infringe one or more identifiable patents in that
|
||||
country that you have reason to believe are valid.
|
||||
|
||||
If, pursuant to or in connection with a single transaction or
|
||||
arrangement, you convey, or propagate by procuring conveyance of, a
|
||||
covered work, and grant a patent license to some of the parties
|
||||
receiving the covered work authorizing them to use, propagate, modify
|
||||
or convey a specific copy of the covered work, then the patent license
|
||||
you grant is automatically extended to all recipients of the covered
|
||||
work and works based on it.
|
||||
|
||||
A patent license is "discriminatory" if it does not include within
|
||||
the scope of its coverage, prohibits the exercise of, or is
|
||||
conditioned on the non-exercise of one or more of the rights that are
|
||||
specifically granted under this License. You may not convey a covered
|
||||
work if you are a party to an arrangement with a third party that is
|
||||
in the business of distributing software, under which you make payment
|
||||
to the third party based on the extent of your activity of conveying
|
||||
the work, and under which the third party grants, to any of the
|
||||
parties who would receive the covered work from you, a discriminatory
|
||||
patent license (a) in connection with copies of the covered work
|
||||
conveyed by you (or copies made from those copies), or (b) primarily
|
||||
for and in connection with specific products or compilations that
|
||||
contain the covered work, unless you entered into that arrangement,
|
||||
or that patent license was granted, prior to 28 March 2007.
|
||||
|
||||
Nothing in this License shall be construed as excluding or limiting
|
||||
any implied license or other defenses to infringement that may
|
||||
otherwise be available to you under applicable patent law.
|
||||
|
||||
12. No Surrender of Others' Freedom.
|
||||
|
||||
If conditions are imposed on you (whether by court order, agreement or
|
||||
otherwise) that contradict the conditions of this License, they do not
|
||||
excuse you from the conditions of this License. If you cannot convey a
|
||||
covered work so as to satisfy simultaneously your obligations under this
|
||||
License and any other pertinent obligations, then as a consequence you may
|
||||
not convey it at all. For example, if you agree to terms that obligate you
|
||||
to collect a royalty for further conveying from those to whom you convey
|
||||
the Program, the only way you could satisfy both those terms and this
|
||||
License would be to refrain entirely from conveying the Program.
|
||||
|
||||
13. Use with the GNU Affero General Public License.
|
||||
|
||||
Notwithstanding any other provision of this License, you have
|
||||
permission to link or combine any covered work with a work licensed
|
||||
under version 3 of the GNU Affero General Public License into a single
|
||||
combined work, and to convey the resulting work. The terms of this
|
||||
License will continue to apply to the part which is the covered work,
|
||||
but the special requirements of the GNU Affero General Public License,
|
||||
section 13, concerning interaction through a network will apply to the
|
||||
combination as such.
|
||||
|
||||
14. Revised Versions of this License.
|
||||
|
||||
The Free Software Foundation may publish revised and/or new versions of
|
||||
the GNU General Public License from time to time. Such new versions will
|
||||
be similar in spirit to the present version, but may differ in detail to
|
||||
address new problems or concerns.
|
||||
|
||||
Each version is given a distinguishing version number. If the
|
||||
Program specifies that a certain numbered version of the GNU General
|
||||
Public License "or any later version" applies to it, you have the
|
||||
option of following the terms and conditions either of that numbered
|
||||
version or of any later version published by the Free Software
|
||||
Foundation. If the Program does not specify a version number of the
|
||||
GNU General Public License, you may choose any version ever published
|
||||
by the Free Software Foundation.
|
||||
|
||||
If the Program specifies that a proxy can decide which future
|
||||
versions of the GNU General Public License can be used, that proxy's
|
||||
public statement of acceptance of a version permanently authorizes you
|
||||
to choose that version for the Program.
|
||||
|
||||
Later license versions may give you additional or different
|
||||
permissions. However, no additional obligations are imposed on any
|
||||
author or copyright holder as a result of your choosing to follow a
|
||||
later version.
|
||||
|
||||
15. Disclaimer of Warranty.
|
||||
|
||||
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
|
||||
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
|
||||
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
|
||||
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
|
||||
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
|
||||
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
|
||||
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
|
||||
|
||||
16. Limitation of Liability.
|
||||
|
||||
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
|
||||
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
|
||||
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
|
||||
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
|
||||
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
|
||||
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
|
||||
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
|
||||
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
|
||||
SUCH DAMAGES.
|
||||
|
||||
17. Interpretation of Sections 15 and 16.
|
||||
|
||||
If the disclaimer of warranty and limitation of liability provided
|
||||
above cannot be given local legal effect according to their terms,
|
||||
reviewing courts shall apply local law that most closely approximates
|
||||
an absolute waiver of all civil liability in connection with the
|
||||
Program, unless a warranty or assumption of liability accompanies a
|
||||
copy of the Program in return for a fee.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
How to Apply These Terms to Your New Programs
|
||||
|
||||
If you develop a new program, and you want it to be of the greatest
|
||||
possible use to the public, the best way to achieve this is to make it
|
||||
free software which everyone can redistribute and change under these terms.
|
||||
|
||||
To do so, attach the following notices to the program. It is safest
|
||||
to attach them to the start of each source file to most effectively
|
||||
state the exclusion of warranty; and each file should have at least
|
||||
the "copyright" line and a pointer to where the full notice is found.
|
||||
|
||||
<one line to give the program's name and a brief idea of what it does.>
|
||||
Copyright (C) <year> <name of author>
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
Also add information on how to contact you by electronic and paper mail.
|
||||
|
||||
If the program does terminal interaction, make it output a short
|
||||
notice like this when it starts in an interactive mode:
|
||||
|
||||
<program> Copyright (C) <year> <name of author>
|
||||
This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
|
||||
This is free software, and you are welcome to redistribute it
|
||||
under certain conditions; type `show c' for details.
|
||||
|
||||
The hypothetical commands `show w' and `show c' should show the appropriate
|
||||
parts of the General Public License. Of course, your program's commands
|
||||
might be different; for a GUI interface, you would use an "about box".
|
||||
|
||||
You should also get your employer (if you work as a programmer) or school,
|
||||
if any, to sign a "copyright disclaimer" for the program, if necessary.
|
||||
For more information on this, and how to apply and follow the GNU GPL, see
|
||||
<https://www.gnu.org/licenses/>.
|
||||
|
||||
The GNU General Public License does not permit incorporating your program
|
||||
into proprietary programs. If your program is a subroutine library, you
|
||||
may consider it more useful to permit linking proprietary applications with
|
||||
the library. If this is what you want to do, use the GNU Lesser General
|
||||
Public License instead of this License. But first, please read
|
||||
<https://www.gnu.org/licenses/why-not-lgpl.html>.
|
||||
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2021 Miaomiao Li

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
@@ -0,0 +1,172 @@
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from PIL import Image
import fnmatch
import cv2

import sys

import numpy as np
from modules import devices
from einops import rearrange
from annotator.annotator_path import models_path

import torchvision
from torchvision.models import MobileNet_V2_Weights
from torchvision import transforms

COLOR_BACKGROUND = (255,255,0)
COLOR_HAIR = (0,0,255)
COLOR_EYE = (255,0,0)
COLOR_MOUTH = (255,255,255)
COLOR_FACE = (0,255,0)
COLOR_SKIN = (0,255,255)
COLOR_CLOTHES = (255,0,255)
PALETTE = [COLOR_BACKGROUND,COLOR_HAIR,COLOR_EYE,COLOR_MOUTH,COLOR_FACE,COLOR_SKIN,COLOR_CLOTHES]


class UNet(nn.Module):
    def __init__(self):
        super(UNet, self).__init__()
        self.NUM_SEG_CLASSES = 7  # Background, hair, face, eye, mouth, skin, clothes

        mobilenet_v2 = torchvision.models.mobilenet_v2(weights=MobileNet_V2_Weights.IMAGENET1K_V1)
        mob_blocks = mobilenet_v2.features

        # Encoder
        self.en_block0 = nn.Sequential(  # in_ch=3 out_ch=16
            mob_blocks[0],
            mob_blocks[1]
        )
        self.en_block1 = nn.Sequential(  # in_ch=16 out_ch=24
            mob_blocks[2],
            mob_blocks[3],
        )
        self.en_block2 = nn.Sequential(  # in_ch=24 out_ch=32
            mob_blocks[4],
            mob_blocks[5],
            mob_blocks[6],
        )
        self.en_block3 = nn.Sequential(  # in_ch=32 out_ch=96
            mob_blocks[7],
            mob_blocks[8],
            mob_blocks[9],
            mob_blocks[10],
            mob_blocks[11],
            mob_blocks[12],
            mob_blocks[13],
        )
        self.en_block4 = nn.Sequential(  # in_ch=96 out_ch=160
            mob_blocks[14],
            mob_blocks[15],
            mob_blocks[16],
        )

        # Decoder
        self.de_block4 = nn.Sequential(  # in_ch=160 out_ch=96
            nn.UpsamplingNearest2d(scale_factor=2),
            nn.Conv2d(160, 96, kernel_size=3, padding=1),
            nn.InstanceNorm2d(96),
            nn.LeakyReLU(0.1),
            nn.Dropout(p=0.2)
        )
        self.de_block3 = nn.Sequential(  # in_ch=96x2 out_ch=32
            nn.UpsamplingNearest2d(scale_factor=2),
            nn.Conv2d(96*2, 32, kernel_size=3, padding=1),
            nn.InstanceNorm2d(32),
            nn.LeakyReLU(0.1),
            nn.Dropout(p=0.2)
        )
        self.de_block2 = nn.Sequential(  # in_ch=32x2 out_ch=24
            nn.UpsamplingNearest2d(scale_factor=2),
            nn.Conv2d(32*2, 24, kernel_size=3, padding=1),
            nn.InstanceNorm2d(24),
            nn.LeakyReLU(0.1),
            nn.Dropout(p=0.2)
        )
        self.de_block1 = nn.Sequential(  # in_ch=24x2 out_ch=16
            nn.UpsamplingNearest2d(scale_factor=2),
            nn.Conv2d(24*2, 16, kernel_size=3, padding=1),
            nn.InstanceNorm2d(16),
            nn.LeakyReLU(0.1),
            nn.Dropout(p=0.2)
        )

        self.de_block0 = nn.Sequential(  # in_ch=16x2 out_ch=7
            nn.UpsamplingNearest2d(scale_factor=2),
            nn.Conv2d(16*2, self.NUM_SEG_CLASSES, kernel_size=3, padding=1),
            nn.Softmax2d()
        )

    def forward(self, x):
        e0 = self.en_block0(x)
        e1 = self.en_block1(e0)
        e2 = self.en_block2(e1)
        e3 = self.en_block3(e2)
        e4 = self.en_block4(e3)

        d4 = self.de_block4(e4)
        d4 = F.interpolate(d4, size=e3.size()[2:], mode='bilinear', align_corners=True)
        c4 = torch.cat((d4,e3),1)

        d3 = self.de_block3(c4)
        d3 = F.interpolate(d3, size=e2.size()[2:], mode='bilinear', align_corners=True)
        c3 = torch.cat((d3,e2),1)

        d2 = self.de_block2(c3)
        d2 = F.interpolate(d2, size=e1.size()[2:], mode='bilinear', align_corners=True)
        c2 = torch.cat((d2,e1),1)

        d1 = self.de_block1(c2)
        d1 = F.interpolate(d1, size=e0.size()[2:], mode='bilinear', align_corners=True)
        c1 = torch.cat((d1,e0),1)
        y = self.de_block0(c1)

        return y


class AnimeFaceSegment:

    model_dir = os.path.join(models_path, "anime_face_segment")

    def __init__(self):
        self.model = None
        self.device = devices.get_device_for("controlnet")

    def load_model(self):
        remote_model_path = "https://huggingface.co/bdsqlsz/qinglong_controlnet-lllite/resolve/main/Annotators/UNet.pth"
        modelpath = os.path.join(self.model_dir, "UNet.pth")
        if not os.path.exists(modelpath):
            from modules.modelloader import load_file_from_url
            load_file_from_url(remote_model_path, model_dir=self.model_dir)
        net = UNet()
        ckpt = torch.load(modelpath, map_location=self.device)
        for key in list(ckpt.keys()):
            if 'module.' in key:
                ckpt[key.replace('module.', '')] = ckpt[key]
                del ckpt[key]
        net.load_state_dict(ckpt)
        net.eval()
        self.model = net.to(self.device)

    def unload_model(self):
        if self.model is not None:
            self.model.cpu()

    def __call__(self, input_image):

        if self.model is None:
            self.load_model()
        self.model.to(self.device)
        transform = transforms.Compose([
            transforms.Resize(512, interpolation=transforms.InterpolationMode.BICUBIC),
            transforms.ToTensor(),])
        img = Image.fromarray(input_image)
        with torch.no_grad():
            img = transform(img).unsqueeze(dim=0).to(self.device)
            seg = self.model(img).squeeze(dim=0)
            seg = seg.cpu().detach().numpy()
            img = rearrange(seg, 'h w c -> w c h')
            img = [[PALETTE[np.argmax(val)] for val in buf] for buf in img]
            return np.array(img).astype(np.uint8)
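A minimal usage sketch for the segmenter above (assuming it is run inside the webui so that `annotator` and `modules` are importable; `face.png` is a hypothetical input file):

import numpy as np
from PIL import Image
from annotator.anime_face_segment import AnimeFaceSegment  # import path assumed from this repo layout

segmenter = AnimeFaceSegment()
rgb = np.array(Image.open("face.png").convert("RGB"))  # hypothetical RGB input
seg_map = segmenter(rgb)            # HxWx3 uint8 map colored with PALETTE
Image.fromarray(seg_map).save("face_seg.png")
segmenter.unload_model()            # move the UNet back to CPU when done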
11
extensions-builtin/forge_legacy_preprocessors/annotator/annotator_path.py
Executable file
11
extensions-builtin/forge_legacy_preprocessors/annotator/annotator_path.py
Executable file
@@ -0,0 +1,11 @@
import os
from modules_forge.shared import preprocessor_dir


models_path = preprocessor_dir
clip_vision_path = os.path.join(preprocessor_dir, 'clip_vision')

os.makedirs(models_path, exist_ok=True)
os.makedirs(clip_vision_path, exist_ok=True)

print(f'ControlNet preprocessor location: {models_path}')
14
extensions-builtin/forge_legacy_preprocessors/annotator/binary/__init__.py
Executable file
14
extensions-builtin/forge_legacy_preprocessors/annotator/binary/__init__.py
Executable file
@@ -0,0 +1,14 @@
import cv2


def apply_binary(img, bin_threshold):
    img_gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)

    if bin_threshold == 0 or bin_threshold == 255:
        # Otsu's threshold
        otsu_threshold, img_bin = cv2.threshold(img_gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
        print("Otsu threshold:", otsu_threshold)
    else:
        _, img_bin = cv2.threshold(img_gray, bin_threshold, 255, cv2.THRESH_BINARY_INV)

    return cv2.cvtColor(img_bin, cv2.COLOR_GRAY2RGB)
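A short sketch of how the binary preprocessor might be called (assumption: `img` is an RGB uint8 array; a threshold of 0 or 255 switches to Otsu's method):

import numpy as np
from annotator.binary import apply_binary  # import path assumed

img = (np.random.rand(256, 256, 3) * 255).astype(np.uint8)  # stand-in image
edges_fixed = apply_binary(img, bin_threshold=127)  # manual threshold
edges_otsu = apply_binary(img, bin_threshold=0)     # Otsu picks the threshold automatically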
@@ -0,0 +1,5 @@
import cv2


def apply_canny(img, low_threshold, high_threshold):
    return cv2.Canny(img, low_threshold, high_threshold)
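Usage is analogous for the Canny preprocessor; the two thresholds are the standard OpenCV hysteresis bounds (sketch, import path assumed):

import numpy as np
from annotator.canny import apply_canny

img = (np.random.rand(256, 256, 3) * 255).astype(np.uint8)   # stand-in image
canny_map = apply_canny(img, low_threshold=100, high_threshold=200)  # single-channel uint8 edge map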
20
extensions-builtin/forge_legacy_preprocessors/annotator/color/__init__.py
Executable file
20
extensions-builtin/forge_legacy_preprocessors/annotator/color/__init__.py
Executable file
@@ -0,0 +1,20 @@
import cv2

def cv2_resize_shortest_edge(image, size):
    h, w = image.shape[:2]
    if h < w:
        new_h = size
        new_w = int(round(w / h * size))
    else:
        new_w = size
        new_h = int(round(h / w * size))
    resized_image = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_AREA)
    return resized_image

def apply_color(img, res=512):
    img = cv2_resize_shortest_edge(img, res)
    h, w = img.shape[:2]

    input_img_color = cv2.resize(img, (w//64, h//64), interpolation=cv2.INTER_CUBIC)
    input_img_color = cv2.resize(input_img_color, (w, h), interpolation=cv2.INTER_NEAREST)
    return input_img_color
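The color preprocessor downsamples the image to 1/64 resolution and scales it back up with nearest-neighbor, producing a coarse color-layout hint. A sketch (import path assumed):

import numpy as np
from annotator.color import apply_color

img = (np.random.rand(480, 640, 3) * 255).astype(np.uint8)  # stand-in RGB image
color_hint = apply_color(img, res=512)  # blocky color layout at the working resolution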
@@ -0,0 +1,57 @@
import torchvision  # Fix issue Unknown builtin op: torchvision::nms
import cv2
import numpy as np
import torch
from einops import rearrange
from .densepose import DensePoseMaskedColormapResultsVisualizer, _extract_i_from_iuvarr, densepose_chart_predictor_output_to_result_with_confidences
from modules import devices
from annotator.annotator_path import models_path
import os

N_PART_LABELS = 24
result_visualizer = DensePoseMaskedColormapResultsVisualizer(
    alpha=1,
    data_extractor=_extract_i_from_iuvarr,
    segm_extractor=_extract_i_from_iuvarr,
    val_scale=255.0 / N_PART_LABELS
)
remote_torchscript_path = "https://huggingface.co/LayerNorm/DensePose-TorchScript-with-hint-image/resolve/main/densepose_r50_fpn_dl.torchscript"
torchscript_model = None
model_dir = os.path.join(models_path, "densepose")

def apply_densepose(input_image, cmap="viridis"):
    global torchscript_model
    if torchscript_model is None:
        model_path = os.path.join(model_dir, "densepose_r50_fpn_dl.torchscript")
        if not os.path.exists(model_path):
            from modules.modelloader import load_file_from_url
            load_file_from_url(remote_torchscript_path, model_dir=model_dir)
        torchscript_model = torch.jit.load(model_path, map_location="cpu").to(devices.get_device_for("controlnet")).eval()
    H, W = input_image.shape[:2]

    hint_image_canvas = np.zeros([H, W], dtype=np.uint8)
    hint_image_canvas = np.tile(hint_image_canvas[:, :, np.newaxis], [1, 1, 3])
    input_image = rearrange(torch.from_numpy(input_image).to(devices.get_device_for("controlnet")), 'h w c -> c h w')
    pred_boxes, coarse_segm, fine_segm, u, v = torchscript_model(input_image)

    extractor = densepose_chart_predictor_output_to_result_with_confidences
    densepose_results = [extractor(pred_boxes[i:i+1], coarse_segm[i:i+1], fine_segm[i:i+1], u[i:i+1], v[i:i+1]) for i in range(len(pred_boxes))]

    if cmap == "viridis":
        result_visualizer.mask_visualizer.cmap = cv2.COLORMAP_VIRIDIS
        hint_image = result_visualizer.visualize(hint_image_canvas, densepose_results)
        hint_image = cv2.cvtColor(hint_image, cv2.COLOR_BGR2RGB)
        hint_image[:, :, 0][hint_image[:, :, 0] == 0] = 68
        hint_image[:, :, 1][hint_image[:, :, 1] == 0] = 1
        hint_image[:, :, 2][hint_image[:, :, 2] == 0] = 84
    else:
        result_visualizer.mask_visualizer.cmap = cv2.COLORMAP_PARULA
        hint_image = result_visualizer.visualize(hint_image_canvas, densepose_results)
        hint_image = cv2.cvtColor(hint_image, cv2.COLOR_BGR2RGB)

    return hint_image

def unload_model():
    global torchscript_model
    if torchscript_model is not None:
        torchscript_model.cpu()
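A sketch of calling the DensePose preprocessor (assumes the webui environment so the TorchScript model can be downloaded on first use; `person.png` is hypothetical):

import numpy as np
from PIL import Image
from annotator.densepose import apply_densepose, unload_model  # import path assumed

rgb = np.array(Image.open("person.png").convert("RGB"))
hint = apply_densepose(rgb, cmap="viridis")  # HxWx3 uint8 IUV-style hint image
unload_model()                               # release the TorchScript model to CPU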
347
extensions-builtin/forge_legacy_preprocessors/annotator/densepose/densepose.py
Executable file
347
extensions-builtin/forge_legacy_preprocessors/annotator/densepose/densepose.py
Executable file
@@ -0,0 +1,347 @@
from typing import Tuple
import math
import numpy as np
from enum import IntEnum
from typing import List, Tuple, Union
import torch
from torch.nn import functional as F
import logging
import cv2

Image = np.ndarray
Boxes = torch.Tensor
ImageSizeType = Tuple[int, int]
_RawBoxType = Union[List[float], Tuple[float, ...], torch.Tensor, np.ndarray]
IntTupleBox = Tuple[int, int, int, int]

class BoxMode(IntEnum):
    """
    Enum of different ways to represent a box.
    """

    XYXY_ABS = 0
    """
    (x0, y0, x1, y1) in absolute floating points coordinates.
    The coordinates in range [0, width or height].
    """
    XYWH_ABS = 1
    """
    (x0, y0, w, h) in absolute floating points coordinates.
    """
    XYXY_REL = 2
    """
    Not yet supported!
    (x0, y0, x1, y1) in range [0, 1]. They are relative to the size of the image.
    """
    XYWH_REL = 3
    """
    Not yet supported!
    (x0, y0, w, h) in range [0, 1]. They are relative to the size of the image.
    """
    XYWHA_ABS = 4
    """
    (xc, yc, w, h, a) in absolute floating points coordinates.
    (xc, yc) is the center of the rotated box, and the angle a is in degrees ccw.
    """

    @staticmethod
    def convert(box: _RawBoxType, from_mode: "BoxMode", to_mode: "BoxMode") -> _RawBoxType:
        """
        Args:
            box: can be a k-tuple, k-list or an Nxk array/tensor, where k = 4 or 5
            from_mode, to_mode (BoxMode)

        Returns:
            The converted box of the same type.
        """
        if from_mode == to_mode:
            return box

        original_type = type(box)
        is_numpy = isinstance(box, np.ndarray)
        single_box = isinstance(box, (list, tuple))
        if single_box:
            assert len(box) == 4 or len(box) == 5, (
                "BoxMode.convert takes either a k-tuple/list or an Nxk array/tensor,"
                " where k == 4 or 5"
            )
            arr = torch.tensor(box)[None, :]
        else:
            # avoid modifying the input box
            if is_numpy:
                arr = torch.from_numpy(np.asarray(box)).clone()
            else:
                arr = box.clone()

        assert to_mode not in [BoxMode.XYXY_REL, BoxMode.XYWH_REL] and from_mode not in [
            BoxMode.XYXY_REL,
            BoxMode.XYWH_REL,
        ], "Relative mode not yet supported!"

        if from_mode == BoxMode.XYWHA_ABS and to_mode == BoxMode.XYXY_ABS:
            assert (
                arr.shape[-1] == 5
            ), "The last dimension of input shape must be 5 for XYWHA format"
            original_dtype = arr.dtype
            arr = arr.double()

            w = arr[:, 2]
            h = arr[:, 3]
            a = arr[:, 4]
            c = torch.abs(torch.cos(a * math.pi / 180.0))
            s = torch.abs(torch.sin(a * math.pi / 180.0))
            # This basically computes the horizontal bounding rectangle of the rotated box
            new_w = c * w + s * h
            new_h = c * h + s * w

            # convert center to top-left corner
            arr[:, 0] -= new_w / 2.0
            arr[:, 1] -= new_h / 2.0
            # bottom-right corner
            arr[:, 2] = arr[:, 0] + new_w
            arr[:, 3] = arr[:, 1] + new_h

            arr = arr[:, :4].to(dtype=original_dtype)
        elif from_mode == BoxMode.XYWH_ABS and to_mode == BoxMode.XYWHA_ABS:
            original_dtype = arr.dtype
            arr = arr.double()
            arr[:, 0] += arr[:, 2] / 2.0
            arr[:, 1] += arr[:, 3] / 2.0
            angles = torch.zeros((arr.shape[0], 1), dtype=arr.dtype)
            arr = torch.cat((arr, angles), axis=1).to(dtype=original_dtype)
        else:
            if to_mode == BoxMode.XYXY_ABS and from_mode == BoxMode.XYWH_ABS:
                arr[:, 2] += arr[:, 0]
                arr[:, 3] += arr[:, 1]
            elif from_mode == BoxMode.XYXY_ABS and to_mode == BoxMode.XYWH_ABS:
                arr[:, 2] -= arr[:, 0]
                arr[:, 3] -= arr[:, 1]
            else:
                raise NotImplementedError(
                    "Conversion from BoxMode {} to {} is not supported yet".format(
                        from_mode, to_mode
                    )
                )

        if single_box:
            return original_type(arr.flatten().tolist())
        if is_numpy:
            return arr.numpy()
        else:
            return arr

class MatrixVisualizer:
    """
    Base visualizer for matrix data
    """

    def __init__(
        self,
        inplace=True,
        cmap=cv2.COLORMAP_PARULA,
        val_scale=1.0,
        alpha=0.7,
        interp_method_matrix=cv2.INTER_LINEAR,
        interp_method_mask=cv2.INTER_NEAREST,
    ):
        self.inplace = inplace
        self.cmap = cmap
        self.val_scale = val_scale
        self.alpha = alpha
        self.interp_method_matrix = interp_method_matrix
        self.interp_method_mask = interp_method_mask

    def visualize(self, image_bgr, mask, matrix, bbox_xywh):
        self._check_image(image_bgr)
        self._check_mask_matrix(mask, matrix)
        if self.inplace:
            image_target_bgr = image_bgr
        else:
            image_target_bgr = image_bgr * 0
        x, y, w, h = [int(v) for v in bbox_xywh]
        if w <= 0 or h <= 0:
            return image_bgr
        mask, matrix = self._resize(mask, matrix, w, h)
        mask_bg = np.tile((mask == 0)[:, :, np.newaxis], [1, 1, 3])
        matrix_scaled = matrix.astype(np.float32) * self.val_scale
        _EPSILON = 1e-6
        if np.any(matrix_scaled > 255 + _EPSILON):
            logger = logging.getLogger(__name__)
            logger.warning(
                f"Matrix has values > {255 + _EPSILON} after " f"scaling, clipping to [0..255]"
            )
        matrix_scaled_8u = matrix_scaled.clip(0, 255).astype(np.uint8)
        matrix_vis = cv2.applyColorMap(matrix_scaled_8u, self.cmap)
        matrix_vis[mask_bg] = image_target_bgr[y : y + h, x : x + w, :][mask_bg]
        image_target_bgr[y : y + h, x : x + w, :] = (
            image_target_bgr[y : y + h, x : x + w, :] * (1.0 - self.alpha) + matrix_vis * self.alpha
        )
        return image_target_bgr.astype(np.uint8)

    def _resize(self, mask, matrix, w, h):
        if (w != mask.shape[1]) or (h != mask.shape[0]):
            mask = cv2.resize(mask, (w, h), self.interp_method_mask)
        if (w != matrix.shape[1]) or (h != matrix.shape[0]):
            matrix = cv2.resize(matrix, (w, h), self.interp_method_matrix)
        return mask, matrix

    def _check_image(self, image_rgb):
        assert len(image_rgb.shape) == 3
        assert image_rgb.shape[2] == 3
        assert image_rgb.dtype == np.uint8

    def _check_mask_matrix(self, mask, matrix):
        assert len(matrix.shape) == 2
        assert len(mask.shape) == 2
        assert mask.dtype == np.uint8

class DensePoseResultsVisualizer:
    def visualize(
        self,
        image_bgr: Image,
        results,
    ) -> Image:
        context = self.create_visualization_context(image_bgr)
        for i, result in enumerate(results):
            boxes_xywh, labels, uv = result
            iuv_array = torch.cat(
                (labels[None].type(torch.float32), uv * 255.0)
            ).type(torch.uint8)
            self.visualize_iuv_arr(context, iuv_array.cpu().numpy(), boxes_xywh)
        image_bgr = self.context_to_image_bgr(context)
        return image_bgr

    def create_visualization_context(self, image_bgr: Image):
        return image_bgr

    def visualize_iuv_arr(self, context, iuv_arr: np.ndarray, bbox_xywh) -> None:
        pass

    def context_to_image_bgr(self, context):
        return context

    def get_image_bgr_from_context(self, context):
        return context

class DensePoseMaskedColormapResultsVisualizer(DensePoseResultsVisualizer):
    def __init__(
        self,
        data_extractor,
        segm_extractor,
        inplace=True,
        cmap=cv2.COLORMAP_PARULA,
        alpha=0.7,
        val_scale=1.0,
        **kwargs,
    ):
        self.mask_visualizer = MatrixVisualizer(
            inplace=inplace, cmap=cmap, val_scale=val_scale, alpha=alpha
        )
        self.data_extractor = data_extractor
        self.segm_extractor = segm_extractor

    def context_to_image_bgr(self, context):
        return context

    def visualize_iuv_arr(self, context, iuv_arr: np.ndarray, bbox_xywh) -> None:
        image_bgr = self.get_image_bgr_from_context(context)
        matrix = self.data_extractor(iuv_arr)
        segm = self.segm_extractor(iuv_arr)
        mask = np.zeros(matrix.shape, dtype=np.uint8)
        mask[segm > 0] = 1
        image_bgr = self.mask_visualizer.visualize(image_bgr, mask, matrix, bbox_xywh)


def _extract_i_from_iuvarr(iuv_arr):
    return iuv_arr[0, :, :]


def _extract_u_from_iuvarr(iuv_arr):
    return iuv_arr[1, :, :]


def _extract_v_from_iuvarr(iuv_arr):
    return iuv_arr[2, :, :]

def make_int_box(box: torch.Tensor) -> IntTupleBox:
    int_box = [0, 0, 0, 0]
    int_box[0], int_box[1], int_box[2], int_box[3] = tuple(box.long().tolist())
    return int_box[0], int_box[1], int_box[2], int_box[3]

def densepose_chart_predictor_output_to_result_with_confidences(
    boxes: Boxes,
    coarse_segm,
    fine_segm,
    u, v
):
    boxes_xyxy_abs = boxes.clone()
    boxes_xywh_abs = BoxMode.convert(boxes_xyxy_abs, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
    box_xywh = make_int_box(boxes_xywh_abs[0])

    labels = resample_fine_and_coarse_segm_tensors_to_bbox(fine_segm, coarse_segm, box_xywh).squeeze(0)
    uv = resample_uv_tensors_to_bbox(u, v, labels, box_xywh)
    confidences = []
    return box_xywh, labels, uv

def resample_fine_and_coarse_segm_tensors_to_bbox(
    fine_segm: torch.Tensor, coarse_segm: torch.Tensor, box_xywh_abs: IntTupleBox
):
    """
    Resample fine and coarse segmentation tensors to the given
    bounding box and derive labels for each pixel of the bounding box

    Args:
        fine_segm: float tensor of shape [1, C, Hout, Wout]
        coarse_segm: float tensor of shape [1, K, Hout, Wout]
        box_xywh_abs (tuple of 4 int): bounding box given by its upper-left
            corner coordinates, width (W) and height (H)
    Return:
        Labels for each pixel of the bounding box, a long tensor of size [1, H, W]
    """
    x, y, w, h = box_xywh_abs
    w = max(int(w), 1)
    h = max(int(h), 1)
    # coarse segmentation
    coarse_segm_bbox = F.interpolate(
        coarse_segm,
        (h, w),
        mode="bilinear",
        align_corners=False,
    ).argmax(dim=1)
    # combined coarse and fine segmentation
    labels = (
        F.interpolate(fine_segm, (h, w), mode="bilinear", align_corners=False).argmax(dim=1)
        * (coarse_segm_bbox > 0).long()
    )
    return labels

def resample_uv_tensors_to_bbox(
    u: torch.Tensor,
    v: torch.Tensor,
    labels: torch.Tensor,
    box_xywh_abs: IntTupleBox,
) -> torch.Tensor:
    """
    Resamples U and V coordinate estimates for the given bounding box

    Args:
        u (tensor [1, C, H, W] of float): U coordinates
        v (tensor [1, C, H, W] of float): V coordinates
        labels (tensor [H, W] of long): labels obtained by resampling segmentation
            outputs for the given bounding box
        box_xywh_abs (tuple of 4 int): bounding box that corresponds to predictor outputs
    Return:
       Resampled U and V coordinates - a tensor [2, H, W] of float
    """
    x, y, w, h = box_xywh_abs
    w = max(int(w), 1)
    h = max(int(h), 1)
    u_bbox = F.interpolate(u, (h, w), mode="bilinear", align_corners=False)
    v_bbox = F.interpolate(v, (h, w), mode="bilinear", align_corners=False)
    uv = torch.zeros([2, h, w], dtype=torch.float32, device=u.device)
    for part_id in range(1, u_bbox.size(1)):
        uv[0][labels == part_id] = u_bbox[0, part_id][labels == part_id]
        uv[1][labels == part_id] = v_bbox[0, part_id][labels == part_id]
    return uv
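A small illustration of the BoxMode conversion helper defined above (values chosen arbitrarily; import path assumed):

import torch
from annotator.densepose.densepose import BoxMode

xyxy = torch.tensor([[10.0, 20.0, 110.0, 220.0]])
xywh = BoxMode.convert(xyxy, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
# xywh -> tensor([[10., 20., 100., 200.]]): width and height replace the bottom-right corner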
79
extensions-builtin/forge_legacy_preprocessors/annotator/depth_anything.py
Executable file
79
extensions-builtin/forge_legacy_preprocessors/annotator/depth_anything.py
Executable file
@@ -0,0 +1,79 @@
import os
import torch
import cv2
import numpy as np
import torch.nn.functional as F
from torchvision.transforms import Compose

from depth_anything.dpt import DPT_DINOv2
from depth_anything.util.transform import Resize, NormalizeImage, PrepareForNet
from .util import load_model
from .annotator_path import models_path


transform = Compose(
    [
        Resize(
            width=518,
            height=518,
            resize_target=False,
            keep_aspect_ratio=True,
            ensure_multiple_of=14,
            resize_method="lower_bound",
            image_interpolation_method=cv2.INTER_CUBIC,
        ),
        NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        PrepareForNet(),
    ]
)


class DepthAnythingDetector:
    """https://github.com/LiheYoung/Depth-Anything"""

    model_dir = os.path.join(models_path, "depth_anything")

    def __init__(self, device: torch.device):
        self.device = device
        self.model = (
            DPT_DINOv2(
                encoder="vitl",
                features=256,
                out_channels=[256, 512, 1024, 1024],
                localhub=False,
            )
            .to(device)
            .eval()
        )
        remote_url = os.environ.get(
            "CONTROLNET_DEPTH_ANYTHING_MODEL_URL",
            "https://huggingface.co/spaces/LiheYoung/Depth-Anything/resolve/main/checkpoints/depth_anything_vitl14.pth",
        )
        model_path = load_model(
            "depth_anything_vitl14.pth", remote_url=remote_url, model_dir=self.model_dir
        )
        self.model.load_state_dict(torch.load(model_path))

    def __call__(self, image: np.ndarray, colored: bool = True) -> np.ndarray:
        self.model.to(self.device)
        h, w = image.shape[:2]

        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) / 255.0
        image = transform({"image": image})["image"]
        image = torch.from_numpy(image).unsqueeze(0).to(self.device)

        @torch.no_grad()
        def predict_depth(model, image):
            return model(image)

        depth = predict_depth(self.model, image)
        depth = F.interpolate(
            depth[None], (h, w), mode="bilinear", align_corners=False
        )[0, 0]
        depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
        depth = depth.cpu().numpy().astype(np.uint8)
        if colored:
            return cv2.applyColorMap(depth, cv2.COLORMAP_INFERNO)[:, :, ::-1]
        else:
            return depth

    def unload_model(self):
        self.model.to("cpu")
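A usage sketch for the detector above (assumes the Depth-Anything package and checkpoint are available through the webui environment; following this module's convention the input is a BGR uint8 array, and `scene.png` is hypothetical):

import cv2
from modules import devices
from annotator.depth_anything import DepthAnythingDetector  # import path assumed

detector = DepthAnythingDetector(devices.get_device_for("controlnet"))
bgr = cv2.imread("scene.png")               # hypothetical input image
depth_vis = detector(bgr, colored=True)     # RGB inferno-colored depth map
detector.unload_model()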
78
extensions-builtin/forge_legacy_preprocessors/annotator/depth_anything_v2.py
Executable file
78
extensions-builtin/forge_legacy_preprocessors/annotator/depth_anything_v2.py
Executable file
@@ -0,0 +1,78 @@
import os
import torch
import cv2
import numpy as np
import torch.nn.functional as F
from torchvision.transforms import Compose
from safetensors.torch import load_file

from depth_anything_v2.dpt import DepthAnythingV2
from depth_anything_v2.util.transform import Resize, NormalizeImage, PrepareForNet
from .util import load_model
from .annotator_path import models_path

transform = Compose(
    [
        Resize(
            width=518,
            height=518,
            resize_target=False,
            keep_aspect_ratio=True,
            ensure_multiple_of=14,
            resize_method="lower_bound",
            image_interpolation_method=cv2.INTER_CUBIC,
        ),
        NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        PrepareForNet(),
    ]
)

class DepthAnythingV2Detector:
    """https://github.com/MackinationsAi/Upgraded-Depth-Anything-V2"""

    model_dir = os.path.join(models_path, "depth_anything_v2")

    def __init__(self, device: torch.device):
        self.device = device
        self.model = (
            DepthAnythingV2(
                encoder="vitl",
                features=256,
                out_channels=[256, 512, 1024, 1024],
            )
            .to(device)
            .eval()
        )
        remote_url = os.environ.get(
            "CONTROLNET_DEPTH_ANYTHING_V2_MODEL_URL",
            "https://huggingface.co/MackinationsAi/Depth-Anything-V2_Safetensors/resolve/main/depth_anything_v2_vitl.safetensors",
        )
        model_path = load_model(
            "depth_anything_v2_vitl.safetensors", remote_url=remote_url, model_dir=self.model_dir
        )
        self.model.load_state_dict(load_file(model_path))

    def __call__(self, image: np.ndarray, colored: bool = True) -> np.ndarray:
        self.model.to(self.device)
        h, w = image.shape[:2]

        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) / 255.0
        image = transform({"image": image})["image"]
        image = torch.from_numpy(image).unsqueeze(0).to(self.device)

        @torch.no_grad()
        def predict_depth(model, image):
            return model(image)

        depth = predict_depth(self.model, image)
        depth = F.interpolate(
            depth[None], (h, w), mode="bilinear", align_corners=False
        )[0, 0]
        depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
        depth = depth.cpu().numpy().astype(np.uint8)
        if colored:
            depth_color = cv2.applyColorMap(depth, cv2.COLORMAP_INFERNO)[:, :, ::-1]
            return depth_color
        else:
            return depth

    def unload_model(self):
        self.model.to("cpu")
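The V2 detector is used the same way; the practical difference is the safetensors checkpoint it loads (sketch, import path and input file assumed):

import cv2
from modules import devices
from annotator.depth_anything_v2 import DepthAnythingV2Detector

detector = DepthAnythingV2Detector(devices.get_device_for("controlnet"))
bgr = cv2.imread("scene.png")                # hypothetical input image
depth_gray = detector(bgr, colored=False)    # single-channel uint8 depth map
detector.unload_model()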
98
extensions-builtin/forge_legacy_preprocessors/annotator/hed/__init__.py
Executable file
98
extensions-builtin/forge_legacy_preprocessors/annotator/hed/__init__.py
Executable file
@@ -0,0 +1,98 @@
# This is an improved version and model of HED edge detection with Apache License, Version 2.0.
# Please use this implementation in your products
# This implementation may produce slightly different results from Saining Xie's official implementations,
# but it generates smoother edges and is more suitable for ControlNet as well as other image-to-image translations.
# Different from official models and other implementations, this is an RGB-input model (rather than BGR)
# and in this way it works better for gradio's RGB protocol

import os
import cv2
import torch
import numpy as np

from einops import rearrange
import os
from modules import devices
from annotator.annotator_path import models_path
from annotator.util import safe_step, nms


class DoubleConvBlock(torch.nn.Module):
    def __init__(self, input_channel, output_channel, layer_number):
        super().__init__()
        self.convs = torch.nn.Sequential()
        self.convs.append(torch.nn.Conv2d(in_channels=input_channel, out_channels=output_channel, kernel_size=(3, 3), stride=(1, 1), padding=1))
        for i in range(1, layer_number):
            self.convs.append(torch.nn.Conv2d(in_channels=output_channel, out_channels=output_channel, kernel_size=(3, 3), stride=(1, 1), padding=1))
        self.projection = torch.nn.Conv2d(in_channels=output_channel, out_channels=1, kernel_size=(1, 1), stride=(1, 1), padding=0)

    def __call__(self, x, down_sampling=False):
        h = x
        if down_sampling:
            h = torch.nn.functional.max_pool2d(h, kernel_size=(2, 2), stride=(2, 2))
        for conv in self.convs:
            h = conv(h)
            h = torch.nn.functional.relu(h)
        return h, self.projection(h)


class ControlNetHED_Apache2(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.norm = torch.nn.Parameter(torch.zeros(size=(1, 3, 1, 1)))
        self.block1 = DoubleConvBlock(input_channel=3, output_channel=64, layer_number=2)
        self.block2 = DoubleConvBlock(input_channel=64, output_channel=128, layer_number=2)
        self.block3 = DoubleConvBlock(input_channel=128, output_channel=256, layer_number=3)
        self.block4 = DoubleConvBlock(input_channel=256, output_channel=512, layer_number=3)
        self.block5 = DoubleConvBlock(input_channel=512, output_channel=512, layer_number=3)

    def __call__(self, x):
        h = x - self.norm
        h, projection1 = self.block1(h)
        h, projection2 = self.block2(h, down_sampling=True)
        h, projection3 = self.block3(h, down_sampling=True)
        h, projection4 = self.block4(h, down_sampling=True)
        h, projection5 = self.block5(h, down_sampling=True)
        return projection1, projection2, projection3, projection4, projection5


netNetwork = None
remote_model_path = "https://huggingface.co/lllyasviel/Annotators/resolve/main/ControlNetHED.pth"
modeldir = os.path.join(models_path, "hed")
old_modeldir = os.path.dirname(os.path.realpath(__file__))


def apply_hed(input_image, is_safe=False):
    global netNetwork
    if netNetwork is None:
        modelpath = os.path.join(modeldir, "ControlNetHED.pth")
        old_modelpath = os.path.join(old_modeldir, "ControlNetHED.pth")
        if os.path.exists(old_modelpath):
            modelpath = old_modelpath
        elif not os.path.exists(modelpath):
            from modules.modelloader import load_file_from_url
            load_file_from_url(remote_model_path, model_dir=modeldir)
        netNetwork = ControlNetHED_Apache2().to(devices.get_device_for("controlnet"))
        netNetwork.load_state_dict(torch.load(modelpath, map_location='cpu'))
        netNetwork.to(devices.get_device_for("controlnet")).float().eval()

    assert input_image.ndim == 3
    H, W, C = input_image.shape
    with torch.no_grad():
        image_hed = torch.from_numpy(input_image.copy()).float().to(devices.get_device_for("controlnet"))
        image_hed = rearrange(image_hed, 'h w c -> 1 c h w')
        edges = netNetwork(image_hed)
        edges = [e.detach().cpu().numpy().astype(np.float32)[0, 0] for e in edges]
        edges = [cv2.resize(e, (W, H), interpolation=cv2.INTER_LINEAR) for e in edges]
        edges = np.stack(edges, axis=2)
        edge = 1 / (1 + np.exp(-np.mean(edges, axis=2).astype(np.float64)))
        if is_safe:
            edge = safe_step(edge)
        edge = (edge * 255.0).clip(0, 255).astype(np.uint8)
        return edge


def unload_hed_model():
    global netNetwork
    if netNetwork is not None:
        netNetwork.cpu()
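A sketch of the HED soft-edge preprocessor (assumes the webui environment; the checkpoint is fetched on first call, and `photo.png` is hypothetical):

import numpy as np
from PIL import Image
from annotator.hed import apply_hed, unload_hed_model  # import path assumed

rgb = np.array(Image.open("photo.png").convert("RGB"))
soft_edge = apply_hed(rgb)                 # uint8 soft edge map
safe_edge = apply_hed(rgb, is_safe=True)   # quantized via safe_step
unload_hed_model()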
212
extensions-builtin/forge_legacy_preprocessors/annotator/keypose/__init__.py
Executable file
212
extensions-builtin/forge_legacy_preprocessors/annotator/keypose/__init__.py
Executable file
@@ -0,0 +1,212 @@
import numpy as np
import cv2
import torch

import os
from modules import devices
from annotator.annotator_path import models_path

import mmcv
from mmdet.apis import inference_detector, init_detector
from mmpose.apis import inference_top_down_pose_model
from mmpose.apis import init_pose_model, process_mmdet_results, vis_pose_result


def preprocessing(image, device):
    # Resize
    scale = 640 / max(image.shape[:2])
    image = cv2.resize(image, dsize=None, fx=scale, fy=scale)
    raw_image = image.astype(np.uint8)

    # Subtract mean values
    image = image.astype(np.float32)
    image -= np.array(
        [
            float(104.008),
            float(116.669),
            float(122.675),
        ]
    )

    # Convert to torch.Tensor and add "batch" axis
    image = torch.from_numpy(image.transpose(2, 0, 1)).float().unsqueeze(0)
    image = image.to(device)

    return image, raw_image


def imshow_keypoints(img,
                     pose_result,
                     skeleton=None,
                     kpt_score_thr=0.1,
                     pose_kpt_color=None,
                     pose_link_color=None,
                     radius=4,
                     thickness=1):
    """Draw keypoints and links on an image.

    Args:
        img (ndarray): The image to draw poses on.
        pose_result (list[kpts]): The poses to draw. Each element kpts is
            a set of K keypoints as an Kx3 numpy.ndarray, where each
            keypoint is represented as x, y, score.
        kpt_score_thr (float, optional): Minimum score of keypoints
            to be shown. Default: 0.3.
        pose_kpt_color (np.array[Nx3]): Color of N keypoints. If None,
            the keypoint will not be drawn.
        pose_link_color (np.array[Mx3]): Color of M links. If None, the
            links will not be drawn.
        thickness (int): Thickness of lines.
    """

    img_h, img_w, _ = img.shape
    img = np.zeros(img.shape)

    for idx, kpts in enumerate(pose_result):
        if idx > 1:
            continue
        kpts = kpts['keypoints']
        # print(kpts)
        kpts = np.array(kpts, copy=False)

        # draw each point on image
        if pose_kpt_color is not None:
            assert len(pose_kpt_color) == len(kpts)

            for kid, kpt in enumerate(kpts):
                x_coord, y_coord, kpt_score = int(kpt[0]), int(kpt[1]), kpt[2]

                if kpt_score < kpt_score_thr or pose_kpt_color[kid] is None:
                    # skip the point that should not be drawn
                    continue

                color = tuple(int(c) for c in pose_kpt_color[kid])
                cv2.circle(img, (int(x_coord), int(y_coord)),
                           radius, color, -1)

        # draw links
        if skeleton is not None and pose_link_color is not None:
            assert len(pose_link_color) == len(skeleton)

            for sk_id, sk in enumerate(skeleton):
                pos1 = (int(kpts[sk[0], 0]), int(kpts[sk[0], 1]))
                pos2 = (int(kpts[sk[1], 0]), int(kpts[sk[1], 1]))

                if (pos1[0] <= 0 or pos1[0] >= img_w or pos1[1] <= 0 or pos1[1] >= img_h or pos2[0] <= 0
                        or pos2[0] >= img_w or pos2[1] <= 0 or pos2[1] >= img_h or kpts[sk[0], 2] < kpt_score_thr
                        or kpts[sk[1], 2] < kpt_score_thr or pose_link_color[sk_id] is None):
                    # skip the link that should not be drawn
                    continue
                color = tuple(int(c) for c in pose_link_color[sk_id])
                cv2.line(img, pos1, pos2, color, thickness=thickness)

    return img


human_det, pose_model = None, None
det_model_path = "https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth"
pose_model_path = "https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_coco_256x192-b9e0b3ab_20200708.pth"

modeldir = os.path.join(models_path, "keypose")
old_modeldir = os.path.dirname(os.path.realpath(__file__))

det_config = 'faster_rcnn_r50_fpn_coco.py'
pose_config = 'hrnet_w48_coco_256x192.py'

det_checkpoint = 'faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth'
pose_checkpoint = 'hrnet_w48_coco_256x192-b9e0b3ab_20200708.pth'
det_cat_id = 1
bbox_thr = 0.2

skeleton = [
    [15, 13], [13, 11], [16, 14], [14, 12], [11, 12], [5, 11], [6, 12], [5, 6], [5, 7], [6, 8],
    [7, 9], [8, 10],
    [1, 2], [0, 1], [0, 2], [1, 3], [2, 4], [3, 5], [4, 6]
]

pose_kpt_color = [
    [51, 153, 255], [51, 153, 255], [51, 153, 255], [51, 153, 255], [51, 153, 255],
    [0, 255, 0],
    [255, 128, 0], [0, 255, 0], [255, 128, 0], [0, 255, 0], [255, 128, 0], [0, 255, 0],
    [255, 128, 0],
    [0, 255, 0], [255, 128, 0], [0, 255, 0], [255, 128, 0]
]

pose_link_color = [
    [0, 255, 0], [0, 255, 0], [255, 128, 0], [255, 128, 0],
    [51, 153, 255], [51, 153, 255], [51, 153, 255], [51, 153, 255], [0, 255, 0],
    [255, 128, 0],
    [0, 255, 0], [255, 128, 0], [51, 153, 255], [51, 153, 255], [51, 153, 255],
    [51, 153, 255],
    [51, 153, 255], [51, 153, 255], [51, 153, 255]
]

def find_download_model(checkpoint, remote_path):
    modelpath = os.path.join(modeldir, checkpoint)
    old_modelpath = os.path.join(old_modeldir, checkpoint)

    if os.path.exists(old_modelpath):
        modelpath = old_modelpath
    elif not os.path.exists(modelpath):
        from modules.modelloader import load_file_from_url
        load_file_from_url(remote_path, model_dir=modeldir)

    return modelpath

def apply_keypose(input_image):
    global human_det, pose_model
    if human_det is None:  # was `netNetwork`, which is undefined in this module
        det_model_local = find_download_model(det_checkpoint, det_model_path)
        hrnet_model_local = find_download_model(pose_checkpoint, pose_model_path)
        det_config_mmcv = mmcv.Config.fromfile(det_config)
        pose_config_mmcv = mmcv.Config.fromfile(pose_config)
        human_det = init_detector(det_config_mmcv, det_model_local, device=devices.get_device_for("controlnet"))
        pose_model = init_pose_model(pose_config_mmcv, hrnet_model_local, device=devices.get_device_for("controlnet"))

    assert input_image.ndim == 3
    input_image = input_image.copy()
    with torch.no_grad():
        image = torch.from_numpy(input_image).float().to(devices.get_device_for("controlnet"))
        image = image / 255.0
        mmdet_results = inference_detector(human_det, image)

        # keep the person class bounding boxes.
        person_results = process_mmdet_results(mmdet_results, det_cat_id)

        return_heatmap = False
        dataset = pose_model.cfg.data['test']['type']

        # e.g. use ('backbone', ) to return backbone feature
        output_layer_names = None
        pose_results, _ = inference_top_down_pose_model(
            pose_model,
            image,
            person_results,
            bbox_thr=bbox_thr,
            format='xyxy',
            dataset=dataset,
            dataset_info=None,
            return_heatmap=return_heatmap,
            outputs=output_layer_names
        )

        im_keypose_out = imshow_keypoints(
            image,
            pose_results,
            skeleton=skeleton,
            pose_kpt_color=pose_kpt_color,
            pose_link_color=pose_link_color,
            radius=2,
            thickness=2
        )
        im_keypose_out = im_keypose_out.astype(np.uint8)

        # image_hed = rearrange(image_hed, 'h w c -> 1 c h w')
        # edge = netNetwork(image_hed)[0]
        # edge = (edge.cpu().numpy() * 255.0).clip(0, 255).astype(np.uint8)
        return im_keypose_out


def unload_hed_model():
    # Name kept from the original module for compatibility; this releases the keypose models.
    global human_det, pose_model
    if human_det is not None:
        human_det.cpu()
    if pose_model is not None:
        pose_model.cpu()
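A sketch of the keypose preprocessor (assumes a working mmcv/mmdet/mmpose install and the two config files next to the module, which is what the code above expects; `people.png` is hypothetical):

import numpy as np
from PIL import Image
from annotator.keypose import apply_keypose  # import path assumed

rgb = np.array(Image.open("people.png").convert("RGB"))
pose_map = apply_keypose(rgb)  # uint8 image with COCO-style keypoints and limbs drawn on black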
@@ -0,0 +1,182 @@
checkpoint_config = dict(interval=1)
# yapf:disable
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        # dict(type='TensorboardLoggerHook')
    ])
# yapf:enable
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]
# optimizer
optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=0.001,
    step=[8, 11])
total_epochs = 12

model = dict(
    type='FasterRCNN',
    pretrained='torchvision://resnet50',
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=True,
        style='pytorch'),
    neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        num_outs=5),
    rpn_head=dict(
        type='RPNHead',
        in_channels=256,
        feat_channels=256,
        anchor_generator=dict(
            type='AnchorGenerator',
            scales=[8],
            ratios=[0.5, 1.0, 2.0],
            strides=[4, 8, 16, 32, 64]),
        bbox_coder=dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[.0, .0, .0, .0],
            target_stds=[1.0, 1.0, 1.0, 1.0]),
        loss_cls=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
    roi_head=dict(
        type='StandardRoIHead',
        bbox_roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
            out_channels=256,
            featmap_strides=[4, 8, 16, 32]),
        bbox_head=dict(
            type='Shared2FCBBoxHead',
            in_channels=256,
            fc_out_channels=1024,
            roi_feat_size=7,
            num_classes=80,
            bbox_coder=dict(
                type='DeltaXYWHBBoxCoder',
                target_means=[0., 0., 0., 0.],
                target_stds=[0.1, 0.1, 0.2, 0.2]),
            reg_class_agnostic=False,
            loss_cls=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
            loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
    # model training and testing settings
    train_cfg=dict(
        rpn=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.7,
                neg_iou_thr=0.3,
                min_pos_iou=0.3,
                match_low_quality=True,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=256,
                pos_fraction=0.5,
                neg_pos_ub=-1,
                add_gt_as_proposals=False),
            allowed_border=-1,
            pos_weight=-1,
            debug=False),
        rpn_proposal=dict(
            nms_pre=2000,
            max_per_img=1000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.5,
                neg_iou_thr=0.5,
                min_pos_iou=0.5,
                match_low_quality=False,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=512,
                pos_fraction=0.25,
                neg_pos_ub=-1,
                add_gt_as_proposals=True),
            pos_weight=-1,
            debug=False)),
    test_cfg=dict(
        rpn=dict(
            nms_pre=1000,
            max_per_img=1000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=dict(
            score_thr=0.05,
            nms=dict(type='nms', iou_threshold=0.5),
            max_per_img=100)
        # soft-nms is also supported for rcnn testing
        # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
    ))

dataset_type = 'CocoDataset'
data_root = 'data/coco'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1333, 800),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='DefaultFormatBundle'),
            dict(type='Collect', keys=['img']),
        ])
]
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(
        type=dataset_type,
        ann_file=f'{data_root}/annotations/instances_train2017.json',
        img_prefix=f'{data_root}/train2017/',
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        ann_file=f'{data_root}/annotations/instances_val2017.json',
        img_prefix=f'{data_root}/val2017/',
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        ann_file=f'{data_root}/annotations/instances_val2017.json',
        img_prefix=f'{data_root}/val2017/',
        pipeline=test_pipeline))
evaluation = dict(interval=1, metric='bbox')
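This file is a plain mmdetection config; it is not imported directly but read through mmcv, as the keypose module above does. A sketch of loading it on its own (file path assumed to be the working directory):

import mmcv

cfg = mmcv.Config.fromfile("faster_rcnn_r50_fpn_coco.py")
print(cfg.model.type)            # 'FasterRCNN'
print(cfg.data.samples_per_gpu)  # 2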
@@ -0,0 +1,169 @@
# _base_ = [
#     '../../../../_base_/default_runtime.py',
#     '../../../../_base_/datasets/coco.py'
# ]
evaluation = dict(interval=10, metric='mAP', save_best='AP')

optimizer = dict(
    type='Adam',
    lr=5e-4,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=0.001,
    step=[170, 200])
total_epochs = 210
channel_cfg = dict(
    num_output_channels=17,
    dataset_joints=17,
    dataset_channel=[
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
    ],
    inference_channel=[
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
    ])

# model settings
model = dict(
    type='TopDown',
    pretrained='https://download.openmmlab.com/mmpose/'
    'pretrain_models/hrnet_w48-8ef0771d.pth',
    backbone=dict(
        type='HRNet',
        in_channels=3,
        extra=dict(
            stage1=dict(
                num_modules=1,
                num_branches=1,
                block='BOTTLENECK',
                num_blocks=(4, ),
                num_channels=(64, )),
            stage2=dict(
                num_modules=1,
                num_branches=2,
                block='BASIC',
                num_blocks=(4, 4),
                num_channels=(48, 96)),
            stage3=dict(
                num_modules=4,
                num_branches=3,
                block='BASIC',
                num_blocks=(4, 4, 4),
                num_channels=(48, 96, 192)),
            stage4=dict(
                num_modules=3,
                num_branches=4,
                block='BASIC',
                num_blocks=(4, 4, 4, 4),
                num_channels=(48, 96, 192, 384))),
    ),
    keypoint_head=dict(
        type='TopdownHeatmapSimpleHead',
        in_channels=48,
        out_channels=channel_cfg['num_output_channels'],
        num_deconv_layers=0,
        extra=dict(final_conv_kernel=1, ),
        loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
    train_cfg=dict(),
    test_cfg=dict(
        flip_test=True,
        post_process='default',
        shift_heatmap=True,
        modulate_kernel=11))

data_cfg = dict(
    image_size=[192, 256],
    heatmap_size=[48, 64],
    num_output_channels=channel_cfg['num_output_channels'],
    num_joints=channel_cfg['dataset_joints'],
    dataset_channel=channel_cfg['dataset_channel'],
    inference_channel=channel_cfg['inference_channel'],
    soft_nms=False,
    nms_thr=1.0,
    oks_thr=0.9,
    vis_thr=0.2,
    use_gt_bbox=False,
    det_bbox_thr=0.0,
    bbox_file='data/coco/person_detection_results/'
    'COCO_val2017_detections_AP_H_56_person.json',
)

train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownGetBboxCenterScale', padding=1.25),
    dict(type='TopDownRandomShiftBboxCenter', shift_factor=0.16, prob=0.3),
    dict(type='TopDownRandomFlip', flip_prob=0.5),
    dict(
        type='TopDownHalfBodyTransform',
        num_joints_half_body=8,
        prob_half_body=0.3),
    dict(
        type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
    dict(type='TopDownAffine'),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(type='TopDownGenerateTarget', sigma=2),
    dict(
        type='Collect',
        keys=['img', 'target', 'target_weight'],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
            'rotation', 'bbox_score', 'flip_pairs'
        ]),
]

val_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownGetBboxCenterScale', padding=1.25),
    dict(type='TopDownAffine'),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(
        type='Collect',
        keys=['img'],
        meta_keys=[
            'image_file', 'center', 'scale', 'rotation', 'bbox_score',
            'flip_pairs'
        ]),
]

test_pipeline = val_pipeline

data_root = 'data/coco'
data = dict(
    samples_per_gpu=32,
    workers_per_gpu=2,
    val_dataloader=dict(samples_per_gpu=32),
    test_dataloader=dict(samples_per_gpu=32),
    train=dict(
        type='TopDownCocoDataset',
        ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
        img_prefix=f'{data_root}/train2017/',
        data_cfg=data_cfg,
        pipeline=train_pipeline,
        dataset_info={{_base_.dataset_info}}),
    val=dict(
        type='TopDownCocoDataset',
        ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
        img_prefix=f'{data_root}/val2017/',
        data_cfg=data_cfg,
        pipeline=val_pipeline,
        dataset_info={{_base_.dataset_info}}),
    test=dict(
        type='TopDownCocoDataset',
        ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
        img_prefix=f'{data_root}/val2017/',
        data_cfg=data_cfg,
        pipeline=test_pipeline,
        dataset_info={{_base_.dataset_info}}),
)
114
extensions-builtin/forge_legacy_preprocessors/annotator/leres/__init__.py
Executable file
114
extensions-builtin/forge_legacy_preprocessors/annotator/leres/__init__.py
Executable file
@@ -0,0 +1,114 @@
|
||||
import cv2
|
||||
import numpy as np
|
||||
import torch
|
||||
import os
|
||||
from modules import devices, shared
|
||||
from annotator.annotator_path import models_path
|
||||
from torchvision.transforms import transforms
|
||||
|
||||
# AdelaiDepth/LeReS imports
|
||||
from .leres.depthmap import estimateleres, estimateboost
|
||||
from .leres.multi_depth_model_woauxi import RelDepthModel
|
||||
from .leres.net_tools import strip_prefix_if_present
|
||||
|
||||
# pix2pix/merge net imports
|
||||
from .pix2pix.options.test_options import TestOptions
|
||||
from .pix2pix.models.pix2pix4depth_model import Pix2Pix4DepthModel
|
||||
|
||||
base_model_path = os.path.join(models_path, "leres")
|
||||
old_modeldir = os.path.dirname(os.path.realpath(__file__))
|
||||
|
||||
remote_model_path_leres = "https://huggingface.co/lllyasviel/Annotators/resolve/main/res101.pth"
|
||||
remote_model_path_pix2pix = "https://huggingface.co/lllyasviel/Annotators/resolve/main/latest_net_G.pth"
|
||||
|
||||
model = None
|
||||
pix2pixmodel = None
|
||||
|
||||
def unload_leres_model():
|
||||
global model, pix2pixmodel
|
||||
if model is not None:
|
||||
model = model.cpu()
|
||||
if pix2pixmodel is not None:
|
||||
pix2pixmodel = pix2pixmodel.unload_network('G')
|
||||
|
||||
|
||||
def apply_leres(input_image, thr_a, thr_b, boost=False):
|
||||
global model, pix2pixmodel
|
||||
if model is None:
|
||||
model_path = os.path.join(base_model_path, "res101.pth")
|
||||
old_model_path = os.path.join(old_modeldir, "res101.pth")
|
||||
|
||||
if os.path.exists(old_model_path):
|
||||
model_path = old_model_path
|
||||
elif not os.path.exists(model_path):
|
||||
from modules.modelloader import load_file_from_url
|
||||
load_file_from_url(remote_model_path_leres, model_dir=base_model_path)
|
||||
|
||||
if torch.cuda.is_available():
|
||||
checkpoint = torch.load(model_path)
|
||||
else:
|
||||
checkpoint = torch.load(model_path, map_location=torch.device('cpu'))
|
||||
|
||||
model = RelDepthModel(backbone='resnext101')
|
||||
model.load_state_dict(strip_prefix_if_present(checkpoint['depth_model'], "module."), strict=True)
|
||||
del checkpoint
|
||||
|
||||
if boost and pix2pixmodel is None:
|
||||
pix2pixmodel_path = os.path.join(base_model_path, "latest_net_G.pth")
|
||||
if not os.path.exists(pix2pixmodel_path):
|
||||
from modules.modelloader import load_file_from_url
|
||||
load_file_from_url(remote_model_path_pix2pix, model_dir=base_model_path)
|
||||
|
||||
opt = TestOptions().parse()
|
||||
if not torch.cuda.is_available():
|
||||
opt.gpu_ids = [] # cpu mode
|
||||
pix2pixmodel = Pix2Pix4DepthModel(opt)
|
||||
pix2pixmodel.save_dir = base_model_path
|
||||
pix2pixmodel.load_networks('latest')
|
||||
pix2pixmodel.eval()
|
||||
|
||||
if devices.get_device_for("controlnet").type != 'mps':
|
||||
model = model.to(devices.get_device_for("controlnet"))
|
||||
|
||||
assert input_image.ndim == 3
|
||||
height, width, dim = input_image.shape
|
||||
|
||||
with torch.no_grad():
|
||||
|
||||
if boost:
|
||||
pix2pixmodel.netG.to(devices.get_device_for("controlnet"))
|
||||
depth = estimateboost(input_image, model, 0, pix2pixmodel, max(width, height))
|
||||
else:
|
||||
depth = estimateleres(input_image, model, width, height)
|
||||
|
||||
numbytes=2
|
||||
depth_min = depth.min()
|
||||
depth_max = depth.max()
|
||||
max_val = (2**(8*numbytes))-1
|
||||
|
||||
# check output before normalizing and mapping to 16 bit
|
||||
if depth_max - depth_min > np.finfo("float").eps:
|
||||
out = max_val * (depth - depth_min) / (depth_max - depth_min)
|
||||
else:
|
||||
out = np.zeros(depth.shape)
|
||||
|
||||
# single channel, 16 bit image
|
||||
depth_image = out.astype("uint16")
|
||||
|
||||
# convert to uint8
|
||||
depth_image = cv2.convertScaleAbs(depth_image, alpha=(255.0/65535.0))
|
||||
|
||||
# remove near
|
||||
if thr_a != 0:
|
||||
thr_a = ((thr_a/100)*255)
|
||||
depth_image = cv2.threshold(depth_image, thr_a, 255, cv2.THRESH_TOZERO)[1]
|
||||
|
||||
# invert image
|
||||
depth_image = cv2.bitwise_not(depth_image)
|
||||
|
||||
# remove bg
|
||||
if thr_b != 0:
|
||||
thr_b = ((thr_b/100)*255)
|
||||
depth_image = cv2.threshold(depth_image, thr_b, 255, cv2.THRESH_TOZERO)[1]
|
||||
|
||||
return depth_image
|
||||
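A minimal usage sketch (an assumption for illustration, not part of the commit): apply_leres takes an HxWx3 uint8 RGB array, lazily downloads res101.pth on the first call, and returns a single-channel uint8 depth map.

import numpy as np
rgb = np.zeros((512, 512, 3), dtype=np.uint8)  # hypothetical placeholder image
depth = apply_leres(rgb, thr_a=0, thr_b=0, boost=False)
print(depth.shape, depth.dtype)  # (512, 512) uint8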
23
extensions-builtin/forge_legacy_preprocessors/annotator/leres/leres/LICENSE
Executable file
23
extensions-builtin/forge_legacy_preprocessors/annotator/leres/leres/LICENSE
Executable file
@@ -0,0 +1,23 @@
https://github.com/thygate/stable-diffusion-webui-depthmap-script

MIT License

Copyright (c) 2023 Bob Thiry

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
||||
199
extensions-builtin/forge_legacy_preprocessors/annotator/leres/leres/Resnet.py
Executable file
199
extensions-builtin/forge_legacy_preprocessors/annotator/leres/leres/Resnet.py
Executable file
@@ -0,0 +1,199 @@
|
||||
import torch.nn as nn
|
||||
import torch.nn as NN
|
||||
|
||||
__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
|
||||
'resnet152']
|
||||
|
||||
|
||||
model_urls = {
|
||||
'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
|
||||
'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
|
||||
'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
|
||||
'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
|
||||
'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
|
||||
}
|
||||
|
||||
|
||||
def conv3x3(in_planes, out_planes, stride=1):
|
||||
"""3x3 convolution with padding"""
|
||||
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
|
||||
padding=1, bias=False)
|
||||
|
||||
|
||||
class BasicBlock(nn.Module):
|
||||
expansion = 1
|
||||
|
||||
def __init__(self, inplanes, planes, stride=1, downsample=None):
|
||||
super(BasicBlock, self).__init__()
|
||||
self.conv1 = conv3x3(inplanes, planes, stride)
|
||||
self.bn1 = NN.BatchNorm2d(planes) #NN.BatchNorm2d
|
||||
self.relu = nn.ReLU(inplace=True)
|
||||
self.conv2 = conv3x3(planes, planes)
|
||||
self.bn2 = NN.BatchNorm2d(planes) #NN.BatchNorm2d
|
||||
self.downsample = downsample
|
||||
self.stride = stride
|
||||
|
||||
def forward(self, x):
|
||||
residual = x
|
||||
|
||||
out = self.conv1(x)
|
||||
out = self.bn1(out)
|
||||
out = self.relu(out)
|
||||
|
||||
out = self.conv2(out)
|
||||
out = self.bn2(out)
|
||||
|
||||
if self.downsample is not None:
|
||||
residual = self.downsample(x)
|
||||
|
||||
out += residual
|
||||
out = self.relu(out)
|
||||
|
||||
return out
|
||||
|
||||
|
||||
class Bottleneck(nn.Module):
|
||||
expansion = 4
|
||||
|
||||
def __init__(self, inplanes, planes, stride=1, downsample=None):
|
||||
super(Bottleneck, self).__init__()
|
||||
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
|
||||
self.bn1 = NN.BatchNorm2d(planes) #NN.BatchNorm2d
|
||||
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
|
||||
padding=1, bias=False)
|
||||
self.bn2 = NN.BatchNorm2d(planes) #NN.BatchNorm2d
|
||||
self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, bias=False)
|
||||
self.bn3 = NN.BatchNorm2d(planes * self.expansion) #NN.BatchNorm2d
|
||||
self.relu = nn.ReLU(inplace=True)
|
||||
self.downsample = downsample
|
||||
self.stride = stride
|
||||
|
||||
def forward(self, x):
|
||||
residual = x
|
||||
|
||||
out = self.conv1(x)
|
||||
out = self.bn1(out)
|
||||
out = self.relu(out)
|
||||
|
||||
out = self.conv2(out)
|
||||
out = self.bn2(out)
|
||||
out = self.relu(out)
|
||||
|
||||
out = self.conv3(out)
|
||||
out = self.bn3(out)
|
||||
|
||||
if self.downsample is not None:
|
||||
residual = self.downsample(x)
|
||||
|
||||
out += residual
|
||||
out = self.relu(out)
|
||||
|
||||
return out
|
||||
|
||||
|
||||
class ResNet(nn.Module):
|
||||
|
||||
def __init__(self, block, layers, num_classes=1000):
|
||||
self.inplanes = 64
|
||||
super(ResNet, self).__init__()
|
||||
self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
|
||||
bias=False)
|
||||
self.bn1 = NN.BatchNorm2d(64) #NN.BatchNorm2d
|
||||
self.relu = nn.ReLU(inplace=True)
|
||||
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
|
||||
self.layer1 = self._make_layer(block, 64, layers[0])
|
||||
self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
|
||||
self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
|
||||
self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
|
||||
#self.avgpool = nn.AvgPool2d(7, stride=1)
|
||||
#self.fc = nn.Linear(512 * block.expansion, num_classes)
|
||||
|
||||
for m in self.modules():
|
||||
if isinstance(m, nn.Conv2d):
|
||||
nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
|
||||
elif isinstance(m, nn.BatchNorm2d):
|
||||
nn.init.constant_(m.weight, 1)
|
||||
nn.init.constant_(m.bias, 0)
|
||||
|
||||
def _make_layer(self, block, planes, blocks, stride=1):
|
||||
downsample = None
|
||||
if stride != 1 or self.inplanes != planes * block.expansion:
|
||||
downsample = nn.Sequential(
|
||||
nn.Conv2d(self.inplanes, planes * block.expansion,
|
||||
kernel_size=1, stride=stride, bias=False),
|
||||
NN.BatchNorm2d(planes * block.expansion), #NN.BatchNorm2d
|
||||
)
|
||||
|
||||
layers = []
|
||||
layers.append(block(self.inplanes, planes, stride, downsample))
|
||||
self.inplanes = planes * block.expansion
|
||||
for i in range(1, blocks):
|
||||
layers.append(block(self.inplanes, planes))
|
||||
|
||||
return nn.Sequential(*layers)
|
||||
|
||||
def forward(self, x):
|
||||
features = []
|
||||
|
||||
x = self.conv1(x)
|
||||
x = self.bn1(x)
|
||||
x = self.relu(x)
|
||||
x = self.maxpool(x)
|
||||
|
||||
x = self.layer1(x)
|
||||
features.append(x)
|
||||
x = self.layer2(x)
|
||||
features.append(x)
|
||||
x = self.layer3(x)
|
||||
features.append(x)
|
||||
x = self.layer4(x)
|
||||
features.append(x)
|
||||
|
||||
return features
|
||||
|
||||
|
||||
def resnet18(pretrained=True, **kwargs):
|
||||
"""Constructs a ResNet-18 model.
|
||||
Args:
|
||||
pretrained (bool): If True, returns a model pre-trained on ImageNet
|
||||
"""
|
||||
model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
|
||||
return model
|
||||
|
||||
|
||||
def resnet34(pretrained=True, **kwargs):
|
||||
"""Constructs a ResNet-34 model.
|
||||
Args:
|
||||
pretrained (bool): If True, returns a model pre-trained on ImageNet
|
||||
"""
|
||||
model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
|
||||
return model
|
||||
|
||||
|
||||
def resnet50(pretrained=True, **kwargs):
|
||||
"""Constructs a ResNet-50 model.
|
||||
Args:
|
||||
pretrained (bool): If True, returns a model pre-trained on ImageNet
|
||||
"""
|
||||
model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
|
||||
|
||||
return model
|
||||
|
||||
|
||||
def resnet101(pretrained=True, **kwargs):
|
||||
"""Constructs a ResNet-101 model.
|
||||
Args:
|
||||
pretrained (bool): If True, returns a model pre-trained on ImageNet
|
||||
"""
|
||||
model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
|
||||
|
||||
return model
|
||||
|
||||
|
||||
def resnet152(pretrained=True, **kwargs):
|
||||
"""Constructs a ResNet-152 model.
|
||||
Args:
|
||||
pretrained (bool): If True, returns a model pre-trained on ImageNet
|
||||
"""
|
||||
model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
|
||||
return model
|
||||
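An illustrative check (not part of the commit): unlike the torchvision classifier, the forward pass above returns the four intermediate feature maps, which the LeReS decoder consumes later.

import torch
backbone = resnet50()  # 'pretrained' is accepted but unused in this file
feats = backbone(torch.randn(1, 3, 224, 224))
print([tuple(f.shape) for f in feats])  # strides 4/8/16/32 with 256/512/1024/2048 channels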
@@ -0,0 +1,237 @@
|
||||
#!/usr/bin/env python
|
||||
# coding: utf-8
|
||||
import torch.nn as nn
|
||||
|
||||
try:
|
||||
from urllib import urlretrieve
|
||||
except ImportError:
|
||||
from urllib.request import urlretrieve
|
||||
|
||||
__all__ = ['resnext101_32x8d']
|
||||
|
||||
|
||||
model_urls = {
|
||||
'resnext50_32x4d': 'https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth',
|
||||
'resnext101_32x8d': 'https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth',
|
||||
}
|
||||
|
||||
|
||||
def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
|
||||
"""3x3 convolution with padding"""
|
||||
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
|
||||
padding=dilation, groups=groups, bias=False, dilation=dilation)
|
||||
|
||||
|
||||
def conv1x1(in_planes, out_planes, stride=1):
|
||||
"""1x1 convolution"""
|
||||
return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
|
||||
|
||||
|
||||
class BasicBlock(nn.Module):
|
||||
expansion = 1
|
||||
|
||||
def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
|
||||
base_width=64, dilation=1, norm_layer=None):
|
||||
super(BasicBlock, self).__init__()
|
||||
if norm_layer is None:
|
||||
norm_layer = nn.BatchNorm2d
|
||||
if groups != 1 or base_width != 64:
|
||||
raise ValueError('BasicBlock only supports groups=1 and base_width=64')
|
||||
if dilation > 1:
|
||||
raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
|
||||
# Both self.conv1 and self.downsample layers downsample the input when stride != 1
|
||||
self.conv1 = conv3x3(inplanes, planes, stride)
|
||||
self.bn1 = norm_layer(planes)
|
||||
self.relu = nn.ReLU(inplace=True)
|
||||
self.conv2 = conv3x3(planes, planes)
|
||||
self.bn2 = norm_layer(planes)
|
||||
self.downsample = downsample
|
||||
self.stride = stride
|
||||
|
||||
def forward(self, x):
|
||||
identity = x
|
||||
|
||||
out = self.conv1(x)
|
||||
out = self.bn1(out)
|
||||
out = self.relu(out)
|
||||
|
||||
out = self.conv2(out)
|
||||
out = self.bn2(out)
|
||||
|
||||
if self.downsample is not None:
|
||||
identity = self.downsample(x)
|
||||
|
||||
out += identity
|
||||
out = self.relu(out)
|
||||
|
||||
return out
|
||||
|
||||
|
||||
class Bottleneck(nn.Module):
|
||||
# Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2)
|
||||
# while original implementation places the stride at the first 1x1 convolution(self.conv1)
|
||||
# according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385.
|
||||
# This variant is also known as ResNet V1.5 and improves accuracy according to
|
||||
# https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.
|
||||
|
||||
expansion = 4
|
||||
|
||||
def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
|
||||
base_width=64, dilation=1, norm_layer=None):
|
||||
super(Bottleneck, self).__init__()
|
||||
if norm_layer is None:
|
||||
norm_layer = nn.BatchNorm2d
|
||||
width = int(planes * (base_width / 64.)) * groups
|
||||
# Both self.conv2 and self.downsample layers downsample the input when stride != 1
|
||||
self.conv1 = conv1x1(inplanes, width)
|
||||
self.bn1 = norm_layer(width)
|
||||
self.conv2 = conv3x3(width, width, stride, groups, dilation)
|
||||
self.bn2 = norm_layer(width)
|
||||
self.conv3 = conv1x1(width, planes * self.expansion)
|
||||
self.bn3 = norm_layer(planes * self.expansion)
|
||||
self.relu = nn.ReLU(inplace=True)
|
||||
self.downsample = downsample
|
||||
self.stride = stride
|
||||
|
||||
def forward(self, x):
|
||||
identity = x
|
||||
|
||||
out = self.conv1(x)
|
||||
out = self.bn1(out)
|
||||
out = self.relu(out)
|
||||
|
||||
out = self.conv2(out)
|
||||
out = self.bn2(out)
|
||||
out = self.relu(out)
|
||||
|
||||
out = self.conv3(out)
|
||||
out = self.bn3(out)
|
||||
|
||||
if self.downsample is not None:
|
||||
identity = self.downsample(x)
|
||||
|
||||
out += identity
|
||||
out = self.relu(out)
|
||||
|
||||
return out
|
||||
|
||||
|
||||
class ResNet(nn.Module):
|
||||
|
||||
def __init__(self, block, layers, num_classes=1000, zero_init_residual=False,
|
||||
groups=1, width_per_group=64, replace_stride_with_dilation=None,
|
||||
norm_layer=None):
|
||||
super(ResNet, self).__init__()
|
||||
if norm_layer is None:
|
||||
norm_layer = nn.BatchNorm2d
|
||||
self._norm_layer = norm_layer
|
||||
|
||||
self.inplanes = 64
|
||||
self.dilation = 1
|
||||
if replace_stride_with_dilation is None:
|
||||
# each element in the tuple indicates if we should replace
|
||||
# the 2x2 stride with a dilated convolution instead
|
||||
replace_stride_with_dilation = [False, False, False]
|
||||
if len(replace_stride_with_dilation) != 3:
|
||||
raise ValueError("replace_stride_with_dilation should be None "
|
||||
"or a 3-element tuple, got {}".format(replace_stride_with_dilation))
|
||||
self.groups = groups
|
||||
self.base_width = width_per_group
|
||||
self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3,
|
||||
bias=False)
|
||||
self.bn1 = norm_layer(self.inplanes)
|
||||
self.relu = nn.ReLU(inplace=True)
|
||||
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
|
||||
self.layer1 = self._make_layer(block, 64, layers[0])
|
||||
self.layer2 = self._make_layer(block, 128, layers[1], stride=2,
|
||||
dilate=replace_stride_with_dilation[0])
|
||||
self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
|
||||
dilate=replace_stride_with_dilation[1])
|
||||
self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
|
||||
dilate=replace_stride_with_dilation[2])
|
||||
#self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
|
||||
#self.fc = nn.Linear(512 * block.expansion, num_classes)
|
||||
|
||||
for m in self.modules():
|
||||
if isinstance(m, nn.Conv2d):
|
||||
nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
|
||||
elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
|
||||
nn.init.constant_(m.weight, 1)
|
||||
nn.init.constant_(m.bias, 0)
|
||||
|
||||
# Zero-initialize the last BN in each residual branch,
|
||||
# so that the residual branch starts with zeros, and each residual block behaves like an identity.
|
||||
# This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
|
||||
if zero_init_residual:
|
||||
for m in self.modules():
|
||||
if isinstance(m, Bottleneck):
|
||||
nn.init.constant_(m.bn3.weight, 0)
|
||||
elif isinstance(m, BasicBlock):
|
||||
nn.init.constant_(m.bn2.weight, 0)
|
||||
|
||||
def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
|
||||
norm_layer = self._norm_layer
|
||||
downsample = None
|
||||
previous_dilation = self.dilation
|
||||
if dilate:
|
||||
self.dilation *= stride
|
||||
stride = 1
|
||||
if stride != 1 or self.inplanes != planes * block.expansion:
|
||||
downsample = nn.Sequential(
|
||||
conv1x1(self.inplanes, planes * block.expansion, stride),
|
||||
norm_layer(planes * block.expansion),
|
||||
)
|
||||
|
||||
layers = []
|
||||
layers.append(block(self.inplanes, planes, stride, downsample, self.groups,
|
||||
self.base_width, previous_dilation, norm_layer))
|
||||
self.inplanes = planes * block.expansion
|
||||
for _ in range(1, blocks):
|
||||
layers.append(block(self.inplanes, planes, groups=self.groups,
|
||||
base_width=self.base_width, dilation=self.dilation,
|
||||
norm_layer=norm_layer))
|
||||
|
||||
return nn.Sequential(*layers)
|
||||
|
||||
def _forward_impl(self, x):
|
||||
# See note [TorchScript super()]
|
||||
features = []
|
||||
x = self.conv1(x)
|
||||
x = self.bn1(x)
|
||||
x = self.relu(x)
|
||||
x = self.maxpool(x)
|
||||
|
||||
x = self.layer1(x)
|
||||
features.append(x)
|
||||
|
||||
x = self.layer2(x)
|
||||
features.append(x)
|
||||
|
||||
x = self.layer3(x)
|
||||
features.append(x)
|
||||
|
||||
x = self.layer4(x)
|
||||
features.append(x)
|
||||
|
||||
#x = self.avgpool(x)
|
||||
#x = torch.flatten(x, 1)
|
||||
#x = self.fc(x)
|
||||
|
||||
return features
|
||||
|
||||
def forward(self, x):
|
||||
return self._forward_impl(x)
|
||||
|
||||
|
||||
|
||||
def resnext101_32x8d(pretrained=True, **kwargs):
|
||||
"""Constructs a ResNet-152 model.
|
||||
Args:
|
||||
pretrained (bool): If True, returns a model pre-trained on ImageNet
|
||||
"""
|
||||
kwargs['groups'] = 32
|
||||
kwargs['width_per_group'] = 8
|
||||
|
||||
model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
|
||||
return model
|
||||
|
||||
546
extensions-builtin/forge_legacy_preprocessors/annotator/leres/leres/depthmap.py
Executable file
546
extensions-builtin/forge_legacy_preprocessors/annotator/leres/leres/depthmap.py
Executable file
@@ -0,0 +1,546 @@
|
||||
# Author: thygate
|
||||
# https://github.com/thygate/stable-diffusion-webui-depthmap-script
|
||||
|
||||
from modules import devices
|
||||
from modules.shared import opts
|
||||
from torchvision.transforms import transforms
|
||||
from operator import getitem
|
||||
|
||||
import torch, gc
|
||||
import cv2
|
||||
import numpy as np
|
||||
import skimage.measure
|
||||
|
||||
whole_size_threshold = 1600 # R_max from the paper
|
||||
pix2pixsize = 1024
|
||||
|
||||
def scale_torch(img):
|
||||
"""
|
||||
Scale the image and output it in torch.tensor.
|
||||
:param img: input rgb is in shape [H, W, C], input depth/disp is in shape [H, W]
|
||||
:param scale: the scale factor. float
|
||||
:return: img. [C, H, W]
|
||||
"""
|
||||
if len(img.shape) == 2:
|
||||
img = img[np.newaxis, :, :]
|
||||
if img.shape[2] == 3:
|
||||
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406) , (0.229, 0.224, 0.225) )])
|
||||
img = transform(img.astype(np.float32))
|
||||
else:
|
||||
img = img.astype(np.float32)
|
||||
img = torch.from_numpy(img)
|
||||
return img
|
||||
|
||||
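A small illustrative check of the helper above (assumption, not in the commit): scale_torch turns an HxWx3 float image into a normalized 3xHxW tensor ready for the depth network.

import numpy as np
t = scale_torch(np.random.rand(448, 448, 3).astype(np.float32))
print(t.shape)  # torch.Size([3, 448, 448])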
def estimateleres(img, model, w, h):
|
||||
# leres transform input
|
||||
rgb_c = img[:, :, ::-1].copy()
|
||||
A_resize = cv2.resize(rgb_c, (w, h))
|
||||
img_torch = scale_torch(A_resize)[None, :, :, :]
|
||||
|
||||
# compute
|
||||
with torch.no_grad():
|
||||
img_torch = img_torch.to(devices.get_device_for("controlnet"))
|
||||
prediction = model.depth_model(img_torch)
|
||||
|
||||
prediction = prediction.squeeze().cpu().numpy()
|
||||
prediction = cv2.resize(prediction, (img.shape[1], img.shape[0]), interpolation=cv2.INTER_CUBIC)
|
||||
|
||||
return prediction
|
||||
|
||||
def generatemask(size):
|
||||
# Generates a Gaussian mask
|
||||
mask = np.zeros(size, dtype=np.float32)
|
||||
sigma = int(size[0]/16)
|
||||
k_size = int(2 * np.ceil(2 * int(size[0]/16)) + 1)
|
||||
mask[int(0.15*size[0]):size[0] - int(0.15*size[0]), int(0.15*size[1]): size[1] - int(0.15*size[1])] = 1
|
||||
mask = cv2.GaussianBlur(mask, (int(k_size), int(k_size)), sigma)
|
||||
mask = (mask - mask.min()) / (mask.max() - mask.min())
|
||||
mask = mask.astype(np.float32)
|
||||
return mask
|
||||
|
||||
def resizewithpool(img, size):
|
||||
i_size = img.shape[0]
|
||||
n = int(np.floor(i_size/size))
|
||||
|
||||
out = skimage.measure.block_reduce(img, (n, n), np.max)
|
||||
return out
|
||||
|
||||
def rgb2gray(rgb):
|
||||
# Converts rgb to gray
|
||||
return np.dot(rgb[..., :3], [0.2989, 0.5870, 0.1140])
|
||||
|
||||
def calculateprocessingres(img, basesize, confidence=0.1, scale_threshold=3, whole_size_threshold=3000):
|
||||
# Returns the R_x resolution described in section 5 of the main paper.
|
||||
|
||||
# Parameters:
|
||||
# img :input rgb image
|
||||
# basesize : size of the dilation kernel, which is equal to the receptive field of the network.
|
||||
# confidence: value of x in R_x; allowed percentage of pixels that are not getting any contextual cue.
|
||||
# scale_threshold: maximum allowed upscaling on the input image ; it has been set to 3.
|
||||
# whole_size_threshold: maximum allowed resolution. (R_max from section 6 of the main paper)
|
||||
|
||||
# Returns:
|
||||
# outputsize_scale*speed_scale :The computed R_x resolution
|
||||
# patch_scale: K parameter from section 6 of the paper
|
||||
|
||||
# speed scale parameter is to process every image in a smaller size to accelerate the R_x resolution search
|
||||
speed_scale = 32
|
||||
image_dim = int(min(img.shape[0:2]))
|
||||
|
||||
gray = rgb2gray(img)
|
||||
grad = np.abs(cv2.Sobel(gray, cv2.CV_64F, 0, 1, ksize=3)) + np.abs(cv2.Sobel(gray, cv2.CV_64F, 1, 0, ksize=3))
|
||||
grad = cv2.resize(grad, (image_dim, image_dim), cv2.INTER_AREA)
|
||||
|
||||
# thresholding the gradient map to generate the edge-map as a proxy of the contextual cues
|
||||
m = grad.min()
|
||||
M = grad.max()
|
||||
middle = m + (0.4 * (M - m))
|
||||
grad[grad < middle] = 0
|
||||
grad[grad >= middle] = 1
|
||||
|
||||
# dilation kernel with size of the receptive field
|
||||
kernel = np.ones((int(basesize/speed_scale), int(basesize/speed_scale)), float)
|
||||
# dilation kernel with size of a quarter of the receptive field, used to compute k
|
||||
# as described in section 6 of main paper
|
||||
kernel2 = np.ones((int(basesize / (4*speed_scale)), int(basesize / (4*speed_scale))), float)
|
||||
|
||||
# Output resolution limit set by the whole_size_threshold and scale_threshold.
|
||||
threshold = min(whole_size_threshold, scale_threshold * max(img.shape[:2]))
|
||||
|
||||
outputsize_scale = basesize / speed_scale
|
||||
for p_size in range(int(basesize/speed_scale), int(threshold/speed_scale), int(basesize / (2*speed_scale))):
|
||||
grad_resized = resizewithpool(grad, p_size)
|
||||
grad_resized = cv2.resize(grad_resized, (p_size, p_size), cv2.INTER_NEAREST)
|
||||
grad_resized[grad_resized >= 0.5] = 1
|
||||
grad_resized[grad_resized < 0.5] = 0
|
||||
|
||||
dilated = cv2.dilate(grad_resized, kernel, iterations=1)
|
||||
meanvalue = (1-dilated).mean()
|
||||
if meanvalue > confidence:
|
||||
break
|
||||
else:
|
||||
outputsize_scale = p_size
|
||||
|
||||
grad_region = cv2.dilate(grad_resized, kernel2, iterations=1)
|
||||
patch_scale = grad_region.mean()
|
||||
|
||||
return int(outputsize_scale*speed_scale), patch_scale
|
||||
|
||||
# Generate a double-input depth estimation
|
||||
def doubleestimate(img, size1, size2, pix2pixsize, model, net_type, pix2pixmodel):
|
||||
# Generate the low resolution estimation
|
||||
estimate1 = singleestimate(img, size1, model, net_type)
|
||||
# Resize to the inference size of merge network.
|
||||
estimate1 = cv2.resize(estimate1, (pix2pixsize, pix2pixsize), interpolation=cv2.INTER_CUBIC)
|
||||
|
||||
# Generate the high resolution estimation
|
||||
estimate2 = singleestimate(img, size2, model, net_type)
|
||||
# Resize to the inference size of merge network.
|
||||
estimate2 = cv2.resize(estimate2, (pix2pixsize, pix2pixsize), interpolation=cv2.INTER_CUBIC)
|
||||
|
||||
# Inference on the merge model
|
||||
pix2pixmodel.set_input(estimate1, estimate2)
|
||||
pix2pixmodel.test()
|
||||
visuals = pix2pixmodel.get_current_visuals()
|
||||
prediction_mapped = visuals['fake_B']
|
||||
prediction_mapped = (prediction_mapped+1)/2
|
||||
prediction_mapped = (prediction_mapped - torch.min(prediction_mapped)) / (
|
||||
torch.max(prediction_mapped) - torch.min(prediction_mapped))
|
||||
prediction_mapped = prediction_mapped.squeeze().cpu().numpy()
|
||||
|
||||
return prediction_mapped
|
||||
|
||||
# Generate a single-input depth estimation
|
||||
def singleestimate(img, msize, model, net_type):
|
||||
# if net_type == 0:
|
||||
return estimateleres(img, model, msize, msize)
|
||||
# else:
|
||||
# return estimatemidasBoost(img, model, msize, msize)
|
||||
|
||||
def applyGridpatch(blsize, stride, img, box):
|
||||
# Extract a simple grid patch.
|
||||
counter1 = 0
|
||||
patch_bound_list = {}
|
||||
for k in range(blsize, img.shape[1] - blsize, stride):
|
||||
for j in range(blsize, img.shape[0] - blsize, stride):
|
||||
patch_bound_list[str(counter1)] = {}
|
||||
patchbounds = [j - blsize, k - blsize, j - blsize + 2 * blsize, k - blsize + 2 * blsize]
|
||||
patch_bound = [box[0] + patchbounds[1], box[1] + patchbounds[0], patchbounds[3] - patchbounds[1],
|
||||
patchbounds[2] - patchbounds[0]]
|
||||
patch_bound_list[str(counter1)]['rect'] = patch_bound
|
||||
patch_bound_list[str(counter1)]['size'] = patch_bound[2]
|
||||
counter1 = counter1 + 1
|
||||
return patch_bound_list
|
||||
|
||||
# Generating local patches to perform the local refinement described in section 6 of the main paper.
|
||||
def generatepatchs(img, base_size):
|
||||
|
||||
# Compute the gradients as a proxy of the contextual cues.
|
||||
img_gray = rgb2gray(img)
|
||||
whole_grad = np.abs(cv2.Sobel(img_gray, cv2.CV_64F, 0, 1, ksize=3)) +\
|
||||
np.abs(cv2.Sobel(img_gray, cv2.CV_64F, 1, 0, ksize=3))
|
||||
|
||||
threshold = whole_grad[whole_grad > 0].mean()
|
||||
whole_grad[whole_grad < threshold] = 0
|
||||
|
||||
# We use the integral image to speed-up the evaluation of the amount of gradients for each patch.
|
||||
gf = whole_grad.sum()/len(whole_grad.reshape(-1))
|
||||
grad_integral_image = cv2.integral(whole_grad)
|
||||
|
||||
# Variables are selected such that the initial patch size would be the receptive field size
|
||||
# and the stride is set to 1/3 of the receptive field size.
|
||||
blsize = int(round(base_size/2))
|
||||
stride = int(round(blsize*0.75))
|
||||
|
||||
# Get initial Grid
|
||||
patch_bound_list = applyGridpatch(blsize, stride, img, [0, 0, 0, 0])
|
||||
|
||||
# Refine initial Grid of patches by discarding the flat (in terms of gradients of the rgb image) ones. Refine
|
||||
# each patch size to ensure that there will be enough depth cues for the network to generate a consistent depth map.
|
||||
print("Selecting patches ...")
|
||||
patch_bound_list = adaptiveselection(grad_integral_image, patch_bound_list, gf)
|
||||
|
||||
# Sort the patch list so that merging is done in the correct order, starting from the biggest patch
|
||||
patchset = sorted(patch_bound_list.items(), key=lambda x: getitem(x[1], 'size'), reverse=True)
|
||||
return patchset
|
||||
|
||||
def getGF_fromintegral(integralimage, rect):
|
||||
# Computes the gradient density of a given patch from the gradient integral image.
|
||||
x1 = rect[1]
|
||||
x2 = rect[1]+rect[3]
|
||||
y1 = rect[0]
|
||||
y2 = rect[0]+rect[2]
|
||||
value = integralimage[x2, y2]-integralimage[x1, y2]-integralimage[x2, y1]+integralimage[x1, y1]
|
||||
return value
|
||||
|
||||
# Adaptively select patches
|
||||
def adaptiveselection(integral_grad, patch_bound_list, gf):
|
||||
patchlist = {}
|
||||
count = 0
|
||||
height, width = integral_grad.shape
|
||||
|
||||
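# note: 'factor' is a module-level global that estimateboost() assigns before patch selection runs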
search_step = int(32/factor)
|
||||
|
||||
# Go through all patches
|
||||
for c in range(len(patch_bound_list)):
|
||||
# Get patch
|
||||
bbox = patch_bound_list[str(c)]['rect']
|
||||
|
||||
# Compute the amount of gradients present in the patch from the integral image.
|
||||
cgf = getGF_fromintegral(integral_grad, bbox)/(bbox[2]*bbox[3])
|
||||
|
||||
# Check if patching is beneficial by comparing the gradient density of the patch to
|
||||
# the gradient density of the whole image
|
||||
if cgf >= gf:
|
||||
bbox_test = bbox.copy()
|
||||
patchlist[str(count)] = {}
|
||||
|
||||
# Enlarge each patch until the gradient density of the patch is equal
|
||||
# to the whole image gradient density
|
||||
while True:
|
||||
|
||||
bbox_test[0] = bbox_test[0] - int(search_step/2)
|
||||
bbox_test[1] = bbox_test[1] - int(search_step/2)
|
||||
|
||||
bbox_test[2] = bbox_test[2] + search_step
|
||||
bbox_test[3] = bbox_test[3] + search_step
|
||||
|
||||
# Check if we are still within the image
|
||||
if bbox_test[0] < 0 or bbox_test[1] < 0 or bbox_test[1] + bbox_test[3] >= height \
|
||||
or bbox_test[0] + bbox_test[2] >= width:
|
||||
break
|
||||
|
||||
# Compare gradient density
|
||||
cgf = getGF_fromintegral(integral_grad, bbox_test)/(bbox_test[2]*bbox_test[3])
|
||||
if cgf < gf:
|
||||
break
|
||||
bbox = bbox_test.copy()
|
||||
|
||||
# Add patch to selected patches
|
||||
patchlist[str(count)]['rect'] = bbox
|
||||
patchlist[str(count)]['size'] = bbox[2]
|
||||
count = count + 1
|
||||
|
||||
# Return selected patches
|
||||
return patchlist
|
||||
|
||||
def impatch(image, rect):
|
||||
# Extract the given patch pixels from a given image.
|
||||
w1 = rect[0]
|
||||
h1 = rect[1]
|
||||
w2 = w1 + rect[2]
|
||||
h2 = h1 + rect[3]
|
||||
image_patch = image[h1:h2, w1:w2]
|
||||
return image_patch
|
||||
|
||||
class ImageandPatchs:
|
||||
def __init__(self, root_dir, name, patchsinfo, rgb_image, scale=1):
|
||||
self.root_dir = root_dir
|
||||
self.patchsinfo = patchsinfo
|
||||
self.name = name
|
||||
self.patchs = patchsinfo
|
||||
self.scale = scale
|
||||
|
||||
self.rgb_image = cv2.resize(rgb_image, (round(rgb_image.shape[1]*scale), round(rgb_image.shape[0]*scale)),
|
||||
interpolation=cv2.INTER_CUBIC)
|
||||
|
||||
self.do_have_estimate = False
|
||||
self.estimation_updated_image = None
|
||||
self.estimation_base_image = None
|
||||
|
||||
def __len__(self):
|
||||
return len(self.patchs)
|
||||
|
||||
def set_base_estimate(self, est):
|
||||
self.estimation_base_image = est
|
||||
if self.estimation_updated_image is not None:
|
||||
self.do_have_estimate = True
|
||||
|
||||
def set_updated_estimate(self, est):
|
||||
self.estimation_updated_image = est
|
||||
if self.estimation_base_image is not None:
|
||||
self.do_have_estimate = True
|
||||
|
||||
def __getitem__(self, index):
|
||||
patch_id = int(self.patchs[index][0])
|
||||
rect = np.array(self.patchs[index][1]['rect'])
|
||||
msize = self.patchs[index][1]['size']
|
||||
|
||||
## applying scale to rect:
|
||||
rect = np.round(rect * self.scale)
|
||||
rect = rect.astype('int')
|
||||
msize = round(msize * self.scale)
|
||||
|
||||
patch_rgb = impatch(self.rgb_image, rect)
|
||||
if self.do_have_estimate:
|
||||
patch_whole_estimate_base = impatch(self.estimation_base_image, rect)
|
||||
patch_whole_estimate_updated = impatch(self.estimation_updated_image, rect)
|
||||
return {'patch_rgb': patch_rgb, 'patch_whole_estimate_base': patch_whole_estimate_base,
|
||||
'patch_whole_estimate_updated': patch_whole_estimate_updated, 'rect': rect,
|
||||
'size': msize, 'id': patch_id}
|
||||
else:
|
||||
return {'patch_rgb': patch_rgb, 'rect': rect, 'size': msize, 'id': patch_id}
|
||||
|
||||
def print_options(self, opt):
|
||||
"""Print and save options
|
||||
|
||||
It will print both current options and default values(if different).
|
||||
It will save options into a text file / [checkpoints_dir] / opt.txt
|
||||
"""
|
||||
message = ''
|
||||
message += '----------------- Options ---------------\n'
|
||||
for k, v in sorted(vars(opt).items()):
|
||||
comment = ''
|
||||
default = self.parser.get_default(k)
|
||||
if v != default:
|
||||
comment = '\t[default: %s]' % str(default)
|
||||
message += '{:>25}: {:<30}{}\n'.format(str(k), str(v), comment)
|
||||
message += '----------------- End -------------------'
|
||||
print(message)
|
||||
|
||||
# save to the disk
|
||||
"""
|
||||
expr_dir = os.path.join(opt.checkpoints_dir, opt.name)
|
||||
util.mkdirs(expr_dir)
|
||||
file_name = os.path.join(expr_dir, '{}_opt.txt'.format(opt.phase))
|
||||
with open(file_name, 'wt') as opt_file:
|
||||
opt_file.write(message)
|
||||
opt_file.write('\n')
|
||||
"""
|
||||
|
||||
def parse(self):
|
||||
"""Parse our options, create checkpoints directory suffix, and set up gpu device."""
|
||||
opt = self.gather_options()
|
||||
opt.isTrain = self.isTrain # train or test
|
||||
|
||||
# process opt.suffix
|
||||
if opt.suffix:
|
||||
suffix = ('_' + opt.suffix.format(**vars(opt))) if opt.suffix != '' else ''
|
||||
opt.name = opt.name + suffix
|
||||
|
||||
#self.print_options(opt)
|
||||
|
||||
# set gpu ids
|
||||
str_ids = opt.gpu_ids.split(',')
|
||||
opt.gpu_ids = []
|
||||
for str_id in str_ids:
|
||||
id = int(str_id)
|
||||
if id >= 0:
|
||||
opt.gpu_ids.append(id)
|
||||
#if len(opt.gpu_ids) > 0:
|
||||
# torch.cuda.set_device(opt.gpu_ids[0])
|
||||
|
||||
self.opt = opt
|
||||
return self.opt
|
||||
|
||||
|
||||
def estimateboost(img, model, model_type, pix2pixmodel, max_res=512):
|
||||
global whole_size_threshold
|
||||
|
||||
# get settings
|
||||
if hasattr(opts, 'depthmap_script_boost_rmax'):
|
||||
whole_size_threshold = opts.depthmap_script_boost_rmax
|
||||
|
||||
if model_type == 0: #leres
|
||||
net_receptive_field_size = 448
|
||||
patch_netsize = 2 * net_receptive_field_size
|
||||
elif model_type == 1: #dpt_beit_large_512
|
||||
net_receptive_field_size = 512
|
||||
patch_netsize = 2 * net_receptive_field_size
|
||||
else: #other midas
|
||||
net_receptive_field_size = 384
|
||||
patch_netsize = 2 * net_receptive_field_size
|
||||
|
||||
gc.collect()
|
||||
devices.torch_gc()
|
||||
|
||||
# Generate the mask used to smoothly blend the local patch estimations into the base estimate.
|
||||
# It is arbitrarily large to avoid artifacts during rescaling for each crop.
|
||||
mask_org = generatemask((3000, 3000))
|
||||
mask = mask_org.copy()
|
||||
|
||||
# Value x of R_x defined in the section 5 of the main paper.
|
||||
r_threshold_value = 0.2
|
||||
#if R0:
|
||||
# r_threshold_value = 0
|
||||
|
||||
input_resolution = img.shape
|
||||
scale_threshold = 3 # Allows up-scaling with a scale up to 3
|
||||
|
||||
# Find the best input resolution R-x. The resolution search described in section 5-double estimation of the main paper and section B of the
|
||||
# supplementary material.
|
||||
whole_image_optimal_size, patch_scale = calculateprocessingres(img, net_receptive_field_size, r_threshold_value, scale_threshold, whole_size_threshold)
|
||||
|
||||
# print('wholeImage being processed in :', whole_image_optimal_size)
|
||||
|
||||
# Generate the base estimate using the double estimation.
|
||||
whole_estimate = doubleestimate(img, net_receptive_field_size, whole_image_optimal_size, pix2pixsize, model, model_type, pix2pixmodel)
|
||||
|
||||
# Compute the multiplier described in section 6 of the main paper to make sure our initial patch can select
|
||||
# small high-density regions of the image.
|
||||
global factor
|
||||
factor = max(min(1, 4 * patch_scale * whole_image_optimal_size / whole_size_threshold), 0.2)
|
||||
# print('Adjust factor is:', 1/factor)
|
||||
|
||||
# Check if Local boosting is beneficial.
|
||||
if max_res < whole_image_optimal_size:
|
||||
# print("No Local boosting. Specified Max Res is smaller than R20, Returning doubleestimate result")
|
||||
return cv2.resize(whole_estimate, (input_resolution[1], input_resolution[0]), interpolation=cv2.INTER_CUBIC)
|
||||
|
||||
# Compute the default target resolution.
|
||||
if img.shape[0] > img.shape[1]:
|
||||
a = 2 * whole_image_optimal_size
|
||||
b = round(2 * whole_image_optimal_size * img.shape[1] / img.shape[0])
|
||||
else:
|
||||
a = round(2 * whole_image_optimal_size * img.shape[0] / img.shape[1])
|
||||
b = 2 * whole_image_optimal_size
|
||||
b = int(round(b / factor))
|
||||
a = int(round(a / factor))
|
||||
|
||||
"""
|
||||
# recompute a, b and saturate to max res.
|
||||
if max(a,b) > max_res:
|
||||
print('Default Res is higher than max-res: Reducing final resolution')
|
||||
if img.shape[0] > img.shape[1]:
|
||||
a = max_res
|
||||
b = round(max_res * img.shape[1] / img.shape[0])
|
||||
else:
|
||||
a = round(max_res * img.shape[0] / img.shape[1])
|
||||
b = max_res
|
||||
b = int(b)
|
||||
a = int(a)
|
||||
"""
|
||||
|
||||
img = cv2.resize(img, (b, a), interpolation=cv2.INTER_CUBIC)
|
||||
|
||||
# Extract selected patches for local refinement
|
||||
base_size = net_receptive_field_size * 2
|
||||
patchset = generatepatchs(img, base_size)
|
||||
|
||||
# print('Target resolution: ', img.shape)
|
||||
|
||||
# Compute a scale in case the user requested the results at the same resolution as the input.
|
||||
# Notice that our method output resolution is independent of the input resolution and this parameter will only
|
||||
# enable a scaling operation during the local patch merge implementation to generate results with the same resolution
|
||||
# as the input.
|
||||
"""
|
||||
if output_resolution == 1:
|
||||
mergein_scale = input_resolution[0] / img.shape[0]
|
||||
print('Dynamically change merged-in resolution; scale:', mergein_scale)
|
||||
else:
|
||||
mergein_scale = 1
|
||||
"""
|
||||
# always rescale to input res for now
|
||||
mergein_scale = input_resolution[0] / img.shape[0]
|
||||
|
||||
imageandpatchs = ImageandPatchs('', '', patchset, img, mergein_scale)
|
||||
whole_estimate_resized = cv2.resize(whole_estimate, (round(img.shape[1]*mergein_scale),
|
||||
round(img.shape[0]*mergein_scale)), interpolation=cv2.INTER_CUBIC)
|
||||
imageandpatchs.set_base_estimate(whole_estimate_resized.copy())
|
||||
imageandpatchs.set_updated_estimate(whole_estimate_resized.copy())
|
||||
|
||||
print('Resulting depthmap resolution will be :', whole_estimate_resized.shape[:2])
|
||||
print('Patches to process: '+str(len(imageandpatchs)))
|
||||
|
||||
# Enumerate through all patches, generate their estimations and refine the base estimate.
|
||||
for patch_ind in range(len(imageandpatchs)):
|
||||
|
||||
# Get patch information
|
||||
patch = imageandpatchs[patch_ind] # patch object
|
||||
patch_rgb = patch['patch_rgb'] # rgb patch
|
||||
patch_whole_estimate_base = patch['patch_whole_estimate_base'] # corresponding patch from base
|
||||
rect = patch['rect'] # patch size and location
|
||||
patch_id = patch['id'] # patch ID
|
||||
org_size = patch_whole_estimate_base.shape # the original size from the unscaled input
|
||||
print('\t Processing patch', patch_ind, '/', len(imageandpatchs)-1, '|', rect)
|
||||
|
||||
# We apply double estimation for patches. The high resolution value is fixed to twice the receptive
|
||||
# field size of the network for patches to accelerate the process.
|
||||
patch_estimation = doubleestimate(patch_rgb, net_receptive_field_size, patch_netsize, pix2pixsize, model, model_type, pix2pixmodel)
|
||||
patch_estimation = cv2.resize(patch_estimation, (pix2pixsize, pix2pixsize), interpolation=cv2.INTER_CUBIC)
|
||||
patch_whole_estimate_base = cv2.resize(patch_whole_estimate_base, (pix2pixsize, pix2pixsize), interpolation=cv2.INTER_CUBIC)
|
||||
|
||||
# Merging the patch estimation into the base estimate using our merge network:
|
||||
# We feed the patch estimation and the same region from the updated base estimate to the merge network
|
||||
# to generate the target estimate for the corresponding region.
|
||||
pix2pixmodel.set_input(patch_whole_estimate_base, patch_estimation)
|
||||
|
||||
# Run merging network
|
||||
pix2pixmodel.test()
|
||||
visuals = pix2pixmodel.get_current_visuals()
|
||||
|
||||
prediction_mapped = visuals['fake_B']
|
||||
prediction_mapped = (prediction_mapped+1)/2
|
||||
prediction_mapped = prediction_mapped.squeeze().cpu().numpy()
|
||||
|
||||
mapped = prediction_mapped
|
||||
|
||||
# We use a simple linear polynomial to make sure the result of the merge network would match the values of
|
||||
# base estimate
|
||||
p_coef = np.polyfit(mapped.reshape(-1), patch_whole_estimate_base.reshape(-1), deg=1)
|
||||
merged = np.polyval(p_coef, mapped.reshape(-1)).reshape(mapped.shape)
|
||||
|
||||
merged = cv2.resize(merged, (org_size[1],org_size[0]), interpolation=cv2.INTER_CUBIC)
|
||||
|
||||
# Get patch size and location
|
||||
w1 = rect[0]
|
||||
h1 = rect[1]
|
||||
w2 = w1 + rect[2]
|
||||
h2 = h1 + rect[3]
|
||||
|
||||
# To speed up the implementation, we only generate the Gaussian mask once with a sufficiently large size
|
||||
# and resize it to our needed size while merging the patches.
|
||||
if mask.shape != org_size:
|
||||
mask = cv2.resize(mask_org, (org_size[1],org_size[0]), interpolation=cv2.INTER_LINEAR)
|
||||
|
||||
tobemergedto = imageandpatchs.estimation_updated_image
|
||||
|
||||
# Update the whole estimation:
|
||||
# We use a simple Gaussian mask to blend the merged patch region with the base estimate to ensure seamless
|
||||
# blending at the boundaries of the patch region.
|
||||
tobemergedto[h1:h2, w1:w2] = np.multiply(tobemergedto[h1:h2, w1:w2], 1 - mask) + np.multiply(merged, mask)
|
||||
imageandpatchs.set_updated_estimate(tobemergedto)
|
||||
|
||||
# output
|
||||
return cv2.resize(imageandpatchs.estimation_updated_image, (input_resolution[1], input_resolution[0]), interpolation=cv2.INTER_CUBIC)
|
||||
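A hedged sketch of how the two entry points relate (illustration only, not part of the commit): estimateleres runs a single fixed-size pass, while estimateboost wraps it in the tiled double-estimation merge driven by the pix2pix network.

import numpy as np
rgb = np.zeros((768, 1024, 3), dtype=np.uint8)  # hypothetical input
# single pass at the network's receptive-field resolution:
#   depth = estimateleres(rgb, model, 448, 448)
# boosted estimate (model_type=0 selects the LeReS receptive field, pix2pixmodel is the merge net):
#   depth = estimateboost(rgb, model, 0, pix2pixmodel, max_res=1024)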
@@ -0,0 +1,34 @@
|
||||
from . import network_auxi as network
|
||||
from .net_tools import get_func
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from modules import devices
|
||||
|
||||
class RelDepthModel(nn.Module):
|
||||
def __init__(self, backbone='resnet50'):
|
||||
super(RelDepthModel, self).__init__()
|
||||
if backbone == 'resnet50':
|
||||
encoder = 'resnet50_stride32'
|
||||
elif backbone == 'resnext101':
|
||||
encoder = 'resnext101_stride32x8d'
|
||||
self.depth_model = DepthModel(encoder)
|
||||
|
||||
def inference(self, rgb):
|
||||
with torch.no_grad():
|
||||
input = rgb.to(self.depth_model.device)
|
||||
depth = self.depth_model(input)
|
||||
#pred_depth_out = depth - depth.min() + 0.01
|
||||
return depth #pred_depth_out
|
||||
|
||||
|
||||
class DepthModel(nn.Module):
|
||||
def __init__(self, encoder):
|
||||
super(DepthModel, self).__init__()
|
||||
backbone = network.__name__.split('.')[-1] + '.' + encoder
|
||||
self.encoder_modules = get_func(backbone)()
|
||||
self.decoder_modules = network.Decoder()
|
||||
|
||||
def forward(self, x):
|
||||
lateral_out = self.encoder_modules(x)
|
||||
out_logit = self.decoder_modules(lateral_out)
|
||||
return out_logit
|
||||
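A construction sketch (an assumption; it only runs where the annotator package above is importable): the wrapper pairs the ResNeXt-101 encoder with the decoder, and apply_leres earlier calls model.depth_model(...) directly rather than inference().

import torch
net = RelDepthModel(backbone='resnext101')
pred = net.depth_model(torch.randn(1, 3, 448, 448))
print(pred.shape)  # torch.Size([1, 1, 448, 448]) relative-depth output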
@@ -0,0 +1,54 @@
import importlib
import torch
import os
from collections import OrderedDict


def get_func(func_name):
    """Helper to return a function object by name. func_name must identify a
    function in this module or the path to a function relative to the
    'annotator.leres.leres' package.
    """
    if func_name == '':
        return None
    try:
        parts = func_name.split('.')
        # Refers to a function in this module
        if len(parts) == 1:
            return globals()[parts[0]]
        # Otherwise, assume we're referencing a module under the leres package
        module_name = 'annotator.leres.leres.' + '.'.join(parts[:-1])
        module = importlib.import_module(module_name)
        return getattr(module, parts[-1])
    except Exception:
        print('Failed to find function: %s' % func_name)
        raise

def load_ckpt(args, depth_model, shift_model, focal_model):
    """
    Load checkpoint.
    """
    if os.path.isfile(args.load_ckpt):
        print("loading checkpoint %s" % args.load_ckpt)
        checkpoint = torch.load(args.load_ckpt)
        if shift_model is not None:
            shift_model.load_state_dict(strip_prefix_if_present(checkpoint['shift_model'], 'module.'),
                                        strict=True)
        if focal_model is not None:
            focal_model.load_state_dict(strip_prefix_if_present(checkpoint['focal_model'], 'module.'),
                                        strict=True)
        depth_model.load_state_dict(strip_prefix_if_present(checkpoint['depth_model'], "module."),
                                    strict=True)
        del checkpoint
        if torch.cuda.is_available():
            torch.cuda.empty_cache()


def strip_prefix_if_present(state_dict, prefix):
    keys = sorted(state_dict.keys())
    if not all(key.startswith(prefix) for key in keys):
        return state_dict
    stripped_state_dict = OrderedDict()
    for key, value in state_dict.items():
        stripped_state_dict[key.replace(prefix, "")] = value
    return stripped_state_dict
||||
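A quick illustrative example (not part of the commit): strip_prefix_if_present only strips when every key carries the prefix; otherwise the state dict is returned untouched.

sd = {"module.conv.weight": 1, "module.conv.bias": 2}
print(strip_prefix_if_present(sd, "module."))    # OrderedDict with keys 'conv.weight', 'conv.bias'
mixed = {"module.conv.weight": 1, "fc.weight": 3}
print(strip_prefix_if_present(mixed, "module."))  # returned unchanged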
@@ -0,0 +1,417 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.init as init
|
||||
|
||||
from . import Resnet, Resnext_torch
|
||||
|
||||
|
||||
def resnet50_stride32():
|
||||
return DepthNet(backbone='resnet', depth=50, upfactors=[2, 2, 2, 2])
|
||||
|
||||
def resnext101_stride32x8d():
|
||||
return DepthNet(backbone='resnext101_32x8d', depth=101, upfactors=[2, 2, 2, 2])
|
||||
|
||||
|
||||
class Decoder(nn.Module):
|
||||
def __init__(self):
|
||||
super(Decoder, self).__init__()
|
||||
self.inchannels = [256, 512, 1024, 2048]
|
||||
self.midchannels = [256, 256, 256, 512]
|
||||
self.upfactors = [2,2,2,2]
|
||||
self.outchannels = 1
|
||||
|
||||
self.conv = FTB(inchannels=self.inchannels[3], midchannels=self.midchannels[3])
|
||||
self.conv1 = nn.Conv2d(in_channels=self.midchannels[3], out_channels=self.midchannels[2], kernel_size=3, padding=1, stride=1, bias=True)
|
||||
self.upsample = nn.Upsample(scale_factor=self.upfactors[3], mode='bilinear', align_corners=True)
|
||||
|
||||
self.ffm2 = FFM(inchannels=self.inchannels[2], midchannels=self.midchannels[2], outchannels = self.midchannels[2], upfactor=self.upfactors[2])
|
||||
self.ffm1 = FFM(inchannels=self.inchannels[1], midchannels=self.midchannels[1], outchannels = self.midchannels[1], upfactor=self.upfactors[1])
|
||||
self.ffm0 = FFM(inchannels=self.inchannels[0], midchannels=self.midchannels[0], outchannels = self.midchannels[0], upfactor=self.upfactors[0])
|
||||
|
||||
self.outconv = AO(inchannels=self.midchannels[0], outchannels=self.outchannels, upfactor=2)
|
||||
self._init_params()
|
||||
|
||||
def _init_params(self):
|
||||
for m in self.modules():
|
||||
if isinstance(m, nn.Conv2d):
|
||||
init.normal_(m.weight, std=0.01)
|
||||
if m.bias is not None:
|
||||
init.constant_(m.bias, 0)
|
||||
elif isinstance(m, nn.ConvTranspose2d):
|
||||
init.normal_(m.weight, std=0.01)
|
||||
if m.bias is not None:
|
||||
init.constant_(m.bias, 0)
|
||||
elif isinstance(m, nn.BatchNorm2d): #NN.BatchNorm2d
|
||||
init.constant_(m.weight, 1)
|
||||
init.constant_(m.bias, 0)
|
||||
elif isinstance(m, nn.Linear):
|
||||
init.normal_(m.weight, std=0.01)
|
||||
if m.bias is not None:
|
||||
init.constant_(m.bias, 0)
|
||||
|
||||
def forward(self, features):
|
||||
x_32x = self.conv(features[3]) # 1/32
|
||||
x_32 = self.conv1(x_32x)
|
||||
x_16 = self.upsample(x_32) # 1/16
|
||||
|
||||
x_8 = self.ffm2(features[2], x_16) # 1/8
|
||||
x_4 = self.ffm1(features[1], x_8) # 1/4
|
||||
x_2 = self.ffm0(features[0], x_4) # 1/2
|
||||
#-----------------------------------------
|
||||
x = self.outconv(x_2) # original size
|
||||
return x
|
||||
|
||||
class DepthNet(nn.Module):
|
||||
__factory = {
|
||||
18: Resnet.resnet18,
|
||||
34: Resnet.resnet34,
|
||||
50: Resnet.resnet50,
|
||||
101: Resnet.resnet101,
|
||||
152: Resnet.resnet152
|
||||
}
|
||||
def __init__(self,
|
||||
backbone='resnet',
|
||||
depth=50,
|
||||
upfactors=[2, 2, 2, 2]):
|
||||
super(DepthNet, self).__init__()
|
||||
self.backbone = backbone
|
||||
self.depth = depth
|
||||
self.pretrained = False
|
||||
self.inchannels = [256, 512, 1024, 2048]
|
||||
self.midchannels = [256, 256, 256, 512]
|
||||
self.upfactors = upfactors
|
||||
self.outchannels = 1
|
||||
|
||||
# Build model
|
||||
if self.backbone == 'resnet':
|
||||
if self.depth not in DepthNet.__factory:
|
||||
raise KeyError("Unsupported depth:", self.depth)
|
||||
self.encoder = DepthNet.__factory[depth](pretrained=self.pretrained)
|
||||
elif self.backbone == 'resnext101_32x8d':
|
||||
self.encoder = Resnext_torch.resnext101_32x8d(pretrained=self.pretrained)
|
||||
else:
|
||||
self.encoder = Resnext_torch.resnext101(pretrained=self.pretrained)
|
||||
|
||||
def forward(self, x):
|
||||
x = self.encoder(x) # 1/32, 1/16, 1/8, 1/4
|
||||
return x
|
||||
|
||||
|
||||
class FTB(nn.Module):
|
||||
def __init__(self, inchannels, midchannels=512):
|
||||
super(FTB, self).__init__()
|
||||
self.in1 = inchannels
|
||||
self.mid = midchannels
|
||||
self.conv1 = nn.Conv2d(in_channels=self.in1, out_channels=self.mid, kernel_size=3, padding=1, stride=1,
|
||||
bias=True)
|
||||
# NN.BatchNorm2d
|
||||
self.conv_branch = nn.Sequential(nn.ReLU(inplace=True), \
|
||||
nn.Conv2d(in_channels=self.mid, out_channels=self.mid, kernel_size=3,
|
||||
padding=1, stride=1, bias=True), \
|
||||
nn.BatchNorm2d(num_features=self.mid), \
|
||||
nn.ReLU(inplace=True), \
|
||||
nn.Conv2d(in_channels=self.mid, out_channels=self.mid, kernel_size=3,
|
||||
padding=1, stride=1, bias=True))
|
||||
self.relu = nn.ReLU(inplace=True)
|
||||
|
||||
self.init_params()
|
||||
|
||||
def forward(self, x):
|
||||
x = self.conv1(x)
|
||||
x = x + self.conv_branch(x)
|
||||
x = self.relu(x)
|
||||
|
||||
return x
|
||||
|
||||
def init_params(self):
|
||||
for m in self.modules():
|
||||
if isinstance(m, nn.Conv2d):
|
||||
init.normal_(m.weight, std=0.01)
|
||||
if m.bias is not None:
|
||||
init.constant_(m.bias, 0)
|
||||
elif isinstance(m, nn.ConvTranspose2d):
|
||||
# init.kaiming_normal_(m.weight, mode='fan_out')
|
||||
init.normal_(m.weight, std=0.01)
|
||||
# init.xavier_normal_(m.weight)
|
||||
if m.bias is not None:
|
||||
init.constant_(m.bias, 0)
|
||||
elif isinstance(m, nn.BatchNorm2d): # NN.BatchNorm2d
|
||||
init.constant_(m.weight, 1)
|
||||
init.constant_(m.bias, 0)
|
||||
elif isinstance(m, nn.Linear):
|
||||
init.normal_(m.weight, std=0.01)
|
||||
if m.bias is not None:
|
||||
init.constant_(m.bias, 0)
|
||||
|
||||
|
||||
class ATA(nn.Module):
|
||||
def __init__(self, inchannels, reduction=8):
|
||||
super(ATA, self).__init__()
|
||||
self.inchannels = inchannels
|
||||
self.avg_pool = nn.AdaptiveAvgPool2d(1)
|
||||
self.fc = nn.Sequential(nn.Linear(self.inchannels * 2, self.inchannels // reduction),
|
||||
nn.ReLU(inplace=True),
|
||||
nn.Linear(self.inchannels // reduction, self.inchannels),
|
||||
nn.Sigmoid())
|
||||
self.init_params()
|
||||
|
||||
def forward(self, low_x, high_x):
|
||||
n, c, _, _ = low_x.size()
|
||||
x = torch.cat([low_x, high_x], 1)
|
||||
x = self.avg_pool(x)
|
||||
x = x.view(n, -1)
|
||||
x = self.fc(x).view(n, c, 1, 1)
|
||||
x = low_x * x + high_x
|
||||
|
||||
return x
|
||||
|
||||
def init_params(self):
|
||||
for m in self.modules():
|
||||
if isinstance(m, nn.Conv2d):
|
||||
# init.kaiming_normal_(m.weight, mode='fan_out')
|
||||
# init.normal(m.weight, std=0.01)
|
||||
init.xavier_normal_(m.weight)
|
||||
if m.bias is not None:
|
||||
init.constant_(m.bias, 0)
|
||||
elif isinstance(m, nn.ConvTranspose2d):
|
||||
# init.kaiming_normal_(m.weight, mode='fan_out')
|
||||
# init.normal_(m.weight, std=0.01)
|
||||
init.xavier_normal_(m.weight)
|
||||
if m.bias is not None:
|
||||
init.constant_(m.bias, 0)
|
||||
elif isinstance(m, nn.BatchNorm2d): # NN.BatchNorm2d
|
||||
init.constant_(m.weight, 1)
|
||||
init.constant_(m.bias, 0)
|
||||
elif isinstance(m, nn.Linear):
|
||||
init.normal_(m.weight, std=0.01)
|
||||
if m.bias is not None:
|
||||
init.constant_(m.bias, 0)
|
||||
|
||||
|
||||
class FFM(nn.Module):
|
||||
def __init__(self, inchannels, midchannels, outchannels, upfactor=2):
|
||||
super(FFM, self).__init__()
|
||||
self.inchannels = inchannels
|
||||
self.midchannels = midchannels
|
||||
self.outchannels = outchannels
|
||||
self.upfactor = upfactor
|
||||
|
||||
self.ftb1 = FTB(inchannels=self.inchannels, midchannels=self.midchannels)
|
||||
# self.ata = ATA(inchannels = self.midchannels)
|
||||
self.ftb2 = FTB(inchannels=self.midchannels, midchannels=self.outchannels)
|
||||
|
||||
self.upsample = nn.Upsample(scale_factor=self.upfactor, mode='bilinear', align_corners=True)
|
||||
|
||||
self.init_params()
|
||||
|
||||
def forward(self, low_x, high_x):
|
||||
x = self.ftb1(low_x)
|
||||
x = x + high_x
|
||||
x = self.ftb2(x)
|
||||
x = self.upsample(x)
|
||||
|
||||
return x
|
||||
|
||||
def init_params(self):
|
||||
for m in self.modules():
|
||||
if isinstance(m, nn.Conv2d):
|
||||
# init.kaiming_normal_(m.weight, mode='fan_out')
|
||||
init.normal_(m.weight, std=0.01)
|
||||
# init.xavier_normal_(m.weight)
|
||||
if m.bias is not None:
|
||||
init.constant_(m.bias, 0)
|
||||
elif isinstance(m, nn.ConvTranspose2d):
|
||||
# init.kaiming_normal_(m.weight, mode='fan_out')
|
||||
init.normal_(m.weight, std=0.01)
|
||||
# init.xavier_normal_(m.weight)
|
||||
if m.bias is not None:
|
||||
init.constant_(m.bias, 0)
|
||||
elif isinstance(m, nn.BatchNorm2d): # NN.Batchnorm2d
|
||||
init.constant_(m.weight, 1)
|
||||
init.constant_(m.bias, 0)
|
||||
elif isinstance(m, nn.Linear):
|
||||
init.normal_(m.weight, std=0.01)
|
||||
if m.bias is not None:
|
||||
init.constant_(m.bias, 0)
|
||||
|
||||
|
||||
class AO(nn.Module):
|
||||
# Adaptive output module
|
||||
def __init__(self, inchannels, outchannels, upfactor=2):
|
||||
super(AO, self).__init__()
|
||||
self.inchannels = inchannels
|
||||
self.outchannels = outchannels
|
||||
self.upfactor = upfactor
|
||||
|
||||
self.adapt_conv = nn.Sequential(
|
||||
nn.Conv2d(in_channels=self.inchannels, out_channels=self.inchannels // 2, kernel_size=3, padding=1,
|
||||
stride=1, bias=True), \
|
||||
nn.BatchNorm2d(num_features=self.inchannels // 2), \
|
||||
nn.ReLU(inplace=True), \
|
||||
nn.Conv2d(in_channels=self.inchannels // 2, out_channels=self.outchannels, kernel_size=3, padding=1,
|
||||
stride=1, bias=True), \
|
||||
nn.Upsample(scale_factor=self.upfactor, mode='bilinear', align_corners=True))
|
||||
|
||||
self.init_params()
|
||||
|
||||
def forward(self, x):
|
||||
x = self.adapt_conv(x)
|
||||
return x
|
||||
|
||||
def init_params(self):
|
||||
for m in self.modules():
|
||||
if isinstance(m, nn.Conv2d):
|
||||
# init.kaiming_normal_(m.weight, mode='fan_out')
|
||||
init.normal_(m.weight, std=0.01)
|
||||
# init.xavier_normal_(m.weight)
|
||||
if m.bias is not None:
|
||||
init.constant_(m.bias, 0)
|
||||
elif isinstance(m, nn.ConvTranspose2d):
|
||||
# init.kaiming_normal_(m.weight, mode='fan_out')
|
||||
init.normal_(m.weight, std=0.01)
|
||||
# init.xavier_normal_(m.weight)
|
||||
if m.bias is not None:
|
||||
init.constant_(m.bias, 0)
|
||||
elif isinstance(m, nn.BatchNorm2d): # NN.Batchnorm2d
|
||||
init.constant_(m.weight, 1)
|
||||
init.constant_(m.bias, 0)
|
||||
elif isinstance(m, nn.Linear):
|
||||
init.normal_(m.weight, std=0.01)
|
||||
if m.bias is not None:
|
||||
init.constant_(m.bias, 0)
|
||||
|
||||
|
||||
|
||||
# ==============================================================================================================
|
||||
|
||||
|
||||
class ResidualConv(nn.Module):
|
||||
def __init__(self, inchannels):
|
||||
super(ResidualConv, self).__init__()
|
||||
# NN.BatchNorm2d
|
||||
self.conv = nn.Sequential(
|
||||
# nn.BatchNorm2d(num_features=inchannels),
|
||||
nn.ReLU(inplace=False),
|
||||
# nn.Conv2d(in_channels=inchannels, out_channels=inchannels, kernel_size=3, padding=1, stride=1, groups=inchannels,bias=True),
|
||||
# nn.Conv2d(in_channels=inchannels, out_channels=inchannels, kernel_size=1, padding=0, stride=1, groups=1,bias=True)
|
||||
nn.Conv2d(in_channels=inchannels, out_channels=inchannels // 2, kernel_size=3, padding=1, stride=1,
|
||||
bias=False),
|
||||
nn.BatchNorm2d(num_features=inchannels // 2),
|
||||
nn.ReLU(inplace=False),
|
||||
nn.Conv2d(in_channels=inchannels // 2, out_channels=inchannels, kernel_size=3, padding=1, stride=1,
|
||||
bias=False)
|
||||
)
|
||||
self.init_params()
|
||||
|
||||
def forward(self, x):
|
||||
x = self.conv(x) + x
|
||||
return x
|
||||
|
||||
def init_params(self):
|
||||
for m in self.modules():
|
||||
if isinstance(m, nn.Conv2d):
|
||||
# init.kaiming_normal_(m.weight, mode='fan_out')
|
||||
init.normal_(m.weight, std=0.01)
|
||||
# init.xavier_normal_(m.weight)
|
||||
if m.bias is not None:
|
||||
init.constant_(m.bias, 0)
|
||||
elif isinstance(m, nn.ConvTranspose2d):
|
||||
# init.kaiming_normal_(m.weight, mode='fan_out')
|
||||
init.normal_(m.weight, std=0.01)
|
||||
# init.xavier_normal_(m.weight)
|
||||
if m.bias is not None:
|
||||
init.constant_(m.bias, 0)
|
||||
elif isinstance(m, nn.BatchNorm2d): # NN.BatchNorm2d
|
||||
init.constant_(m.weight, 1)
|
||||
init.constant_(m.bias, 0)
|
||||
elif isinstance(m, nn.Linear):
|
||||
init.normal_(m.weight, std=0.01)
|
||||
if m.bias is not None:
|
||||
init.constant_(m.bias, 0)
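
# Hypothetical shape-check sketch (not part of the original network definition,
# values are illustrative): ResidualConv preserves the (N, C, H, W) shape of its
# input because the convolution branch output is added back onto x in forward().
def _residual_conv_shape_check():
    block = ResidualConv(inchannels=64)
    y = block(torch.randn(1, 64, 32, 32))
    assert y.shape == (1, 64, 32, 32)
    return y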
|
||||
|
||||
|
||||
class FeatureFusion(nn.Module):
|
||||
def __init__(self, inchannels, outchannels):
|
||||
super(FeatureFusion, self).__init__()
|
||||
self.conv = ResidualConv(inchannels=inchannels)
|
||||
# NN.BatchNorm2d
|
||||
self.up = nn.Sequential(ResidualConv(inchannels=inchannels),
|
||||
nn.ConvTranspose2d(in_channels=inchannels, out_channels=outchannels, kernel_size=3,
|
||||
stride=2, padding=1, output_padding=1),
|
||||
nn.BatchNorm2d(num_features=outchannels),
|
||||
nn.ReLU(inplace=True))
|
||||
|
||||
def forward(self, lowfeat, highfeat):
|
||||
return self.up(highfeat + self.conv(lowfeat))
|
||||
|
||||
def init_params(self):
|
||||
for m in self.modules():
|
||||
if isinstance(m, nn.Conv2d):
|
||||
# init.kaiming_normal_(m.weight, mode='fan_out')
|
||||
init.normal_(m.weight, std=0.01)
|
||||
# init.xavier_normal_(m.weight)
|
||||
if m.bias is not None:
|
||||
init.constant_(m.bias, 0)
|
||||
elif isinstance(m, nn.ConvTranspose2d):
|
||||
# init.kaiming_normal_(m.weight, mode='fan_out')
|
||||
init.normal_(m.weight, std=0.01)
|
||||
# init.xavier_normal_(m.weight)
|
||||
if m.bias is not None:
|
||||
init.constant_(m.bias, 0)
|
||||
elif isinstance(m, nn.BatchNorm2d): # NN.BatchNorm2d
|
||||
init.constant_(m.weight, 1)
|
||||
init.constant_(m.bias, 0)
|
||||
elif isinstance(m, nn.Linear):
|
||||
init.normal_(m.weight, std=0.01)
|
||||
if m.bias is not None:
|
||||
init.constant_(m.bias, 0)
|
||||
|
||||
|
||||
class SenceUnderstand(nn.Module):
|
||||
def __init__(self, channels):
|
||||
super(SenceUnderstand, self).__init__()
|
||||
self.channels = channels
|
||||
self.conv1 = nn.Sequential(nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1),
|
||||
nn.ReLU(inplace=True))
|
||||
self.pool = nn.AdaptiveAvgPool2d(8)
|
||||
self.fc = nn.Sequential(nn.Linear(512 * 8 * 8, self.channels),
|
||||
nn.ReLU(inplace=True))
|
||||
self.conv2 = nn.Sequential(
|
||||
nn.Conv2d(in_channels=self.channels, out_channels=self.channels, kernel_size=1, padding=0),
|
||||
nn.ReLU(inplace=True))
|
||||
self.initial_params()
|
||||
|
||||
def forward(self, x):
|
||||
n, c, h, w = x.size()
|
||||
x = self.conv1(x)
|
||||
x = self.pool(x)
|
||||
x = x.view(n, -1)
|
||||
x = self.fc(x)
|
||||
x = x.view(n, self.channels, 1, 1)
|
||||
x = self.conv2(x)
|
||||
x = x.repeat(1, 1, h, w)
|
||||
return x
|
||||
|
||||
def initial_params(self, dev=0.01):
|
||||
for m in self.modules():
|
||||
if isinstance(m, nn.Conv2d):
|
||||
# print torch.sum(m.weight)
|
||||
m.weight.data.normal_(0, dev)
|
||||
if m.bias is not None:
|
||||
m.bias.data.fill_(0)
|
||||
elif isinstance(m, nn.ConvTranspose2d):
|
||||
# print torch.sum(m.weight)
|
||||
m.weight.data.normal_(0, dev)
|
||||
if m.bias is not None:
|
||||
m.bias.data.fill_(0)
|
||||
elif isinstance(m, nn.Linear):
|
||||
m.weight.data.normal_(0, dev)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    net = DepthNet(depth=50, pretrained=True)
    print(net)
    inputs = torch.ones(4, 3, 128, 128)
    out = net(inputs)
    print(out.size())
|
||||
@@ -0,0 +1,19 @@
|
||||
https://github.com/compphoto/BoostingMonocularDepth
|
||||
|
||||
Copyright 2021, Seyed Mahdi Hosseini Miangoleh, Sebastian Dille, Computational Photography Laboratory. All rights reserved.
|
||||
|
||||
This software is for academic use only. A redistribution of this
|
||||
software, with or without modifications, has to be for academic
|
||||
use only, while giving the appropriate credit to the original
|
||||
authors of the software. The methods implemented as a part of
|
||||
this software may be covered under patents or patent applications.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE AUTHOR ''AS IS'' AND ANY EXPRESS OR IMPLIED
|
||||
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
||||
FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR
|
||||
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
||||
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
|
||||
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
@@ -0,0 +1,67 @@
|
||||
"""This package contains modules related to objective functions, optimizations, and network architectures.
|
||||
|
||||
To add a custom model class called 'dummy', you need to add a file called 'dummy_model.py' and define a subclass DummyModel inherited from BaseModel.
|
||||
You need to implement the following five functions:
|
||||
-- <__init__>: initialize the class; first call BaseModel.__init__(self, opt).
|
||||
-- <set_input>: unpack data from dataset and apply preprocessing.
|
||||
-- <forward>: produce intermediate results.
|
||||
-- <optimize_parameters>: calculate loss, gradients, and update network weights.
|
||||
-- <modify_commandline_options>: (optionally) add model-specific options and set default options.
|
||||
|
||||
In the function <__init__>, you need to define four lists:
|
||||
-- self.loss_names (str list): specify the training losses that you want to plot and save.
|
||||
-- self.model_names (str list): define networks used in our training.
|
||||
-- self.visual_names (str list): specify the images that you want to display and save.
|
||||
-- self.optimizers (optimizer list): define and initialize optimizers. You can define one optimizer for each network. If two networks are updated at the same time, you can use itertools.chain to group them. See cycle_gan_model.py for an usage.
|
||||
|
||||
Now you can use the model class by specifying flag '--model dummy'.
|
||||
See our template model class 'template_model.py' for more details.
|
||||
"""
|
||||
|
||||
import importlib
|
||||
from .base_model import BaseModel
|
||||
|
||||
|
||||
def find_model_using_name(model_name):
|
||||
"""Import the module "models/[model_name]_model.py".
|
||||
|
||||
In the file, the class called DatasetNameModel() will
|
||||
be instantiated. It has to be a subclass of BaseModel,
|
||||
and it is case-insensitive.
|
||||
"""
|
||||
model_filename = "annotator.leres.pix2pix.models." + model_name + "_model"
|
||||
modellib = importlib.import_module(model_filename)
|
||||
model = None
|
||||
target_model_name = model_name.replace('_', '') + 'model'
|
||||
for name, cls in modellib.__dict__.items():
|
||||
if name.lower() == target_model_name.lower() \
|
||||
and issubclass(cls, BaseModel):
|
||||
model = cls
|
||||
|
||||
if model is None:
|
||||
print("In %s.py, there should be a subclass of BaseModel with class name that matches %s in lowercase." % (model_filename, target_model_name))
|
||||
exit(0)
|
||||
|
||||
return model
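
# Hypothetical usage sketch (model name illustrative): looking up 'pix2pix4depth'
# imports annotator.leres.pix2pix.models.pix2pix4depth_model and returns the class
# whose lowercased name matches 'pix2pix4depthmodel' (Pix2Pix4DepthModel).
def _find_model_example():
    return find_model_using_name('pix2pix4depth')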
|
||||
|
||||
|
||||
def get_option_setter(model_name):
|
||||
"""Return the static method <modify_commandline_options> of the model class."""
|
||||
model_class = find_model_using_name(model_name)
|
||||
return model_class.modify_commandline_options
|
||||
|
||||
|
||||
def create_model(opt):
|
||||
"""Create a model given the option.
|
||||
|
||||
This function wraps the class CustomDatasetDataLoader.
|
||||
This is the main interface between this package and 'train.py'/'test.py'
|
||||
|
||||
Example:
|
||||
>>> from models import create_model
|
||||
>>> model = create_model(opt)
|
||||
"""
|
||||
model = find_model_using_name(opt.model)
|
||||
instance = model(opt)
|
||||
print("model [%s] was created" % type(instance).__name__)
|
||||
return instance
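
# Hypothetical sketch of the minimal interface described in the module docstring.
# A real 'dummy' model would live in its own file (dummy_model.py) so that
# find_model_using_name('dummy') can import it; the class below only illustrates
# the required methods and attributes.
class _DummyModelSketch(BaseModel):
    def __init__(self, opt):
        BaseModel.__init__(self, opt)
        self.loss_names = []
        self.model_names = []
        self.visual_names = []
        self.optimizers = []

    def set_input(self, input):
        self.image_paths = input.get('image_path', [])

    def forward(self):
        pass

    def optimize_parameters(self):
        self.forward()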
|
||||
@@ -0,0 +1,241 @@
|
||||
import os
|
||||
import torch, gc
|
||||
from modules import devices
|
||||
from collections import OrderedDict
|
||||
from abc import ABC, abstractmethod
|
||||
from . import networks
|
||||
|
||||
|
||||
class BaseModel(ABC):
|
||||
"""This class is an abstract base class (ABC) for models.
|
||||
To create a subclass, you need to implement the following five functions:
|
||||
-- <__init__>: initialize the class; first call BaseModel.__init__(self, opt).
|
||||
-- <set_input>: unpack data from dataset and apply preprocessing.
|
||||
-- <forward>: produce intermediate results.
|
||||
-- <optimize_parameters>: calculate losses, gradients, and update network weights.
|
||||
-- <modify_commandline_options>: (optionally) add model-specific options and set default options.
|
||||
"""
|
||||
|
||||
def __init__(self, opt):
|
||||
"""Initialize the BaseModel class.
|
||||
|
||||
Parameters:
|
||||
opt (Option class)-- stores all the experiment flags; needs to be a subclass of BaseOptions
|
||||
|
||||
When creating your custom class, you need to implement your own initialization.
|
||||
In this function, you should first call <BaseModel.__init__(self, opt)>
|
||||
Then, you need to define four lists:
|
||||
-- self.loss_names (str list): specify the training losses that you want to plot and save.
|
||||
-- self.model_names (str list): define networks used in our training.
|
||||
-- self.visual_names (str list): specify the images that you want to display and save.
|
||||
-- self.optimizers (optimizer list): define and initialize optimizers. You can define one optimizer for each network. If two networks are updated at the same time, you can use itertools.chain to group them. See cycle_gan_model.py for an example.
|
||||
"""
|
||||
self.opt = opt
|
||||
self.gpu_ids = opt.gpu_ids
|
||||
self.isTrain = opt.isTrain
|
||||
self.device = torch.device('cuda:{}'.format(self.gpu_ids[0])) if self.gpu_ids else torch.device('cpu') # get device name: CPU or GPU
|
||||
self.save_dir = os.path.join(opt.checkpoints_dir, opt.name) # save all the checkpoints to save_dir
|
||||
if opt.preprocess != 'scale_width': # with [scale_width], input images might have different sizes, which hurts the performance of cudnn.benchmark.
|
||||
torch.backends.cudnn.benchmark = True
|
||||
self.loss_names = []
|
||||
self.model_names = []
|
||||
self.visual_names = []
|
||||
self.optimizers = []
|
||||
self.image_paths = []
|
||||
self.metric = 0 # used for learning rate policy 'plateau'
|
||||
|
||||
@staticmethod
|
||||
def modify_commandline_options(parser, is_train):
|
||||
"""Add new model-specific options, and rewrite default values for existing options.
|
||||
|
||||
Parameters:
|
||||
parser -- original option parser
|
||||
is_train (bool) -- whether training phase or test phase. You can use this flag to add training-specific or test-specific options.
|
||||
|
||||
Returns:
|
||||
the modified parser.
|
||||
"""
|
||||
return parser
|
||||
|
||||
@abstractmethod
|
||||
def set_input(self, input):
|
||||
"""Unpack input data from the dataloader and perform necessary pre-processing steps.
|
||||
|
||||
Parameters:
|
||||
input (dict): includes the data itself and its metadata information.
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def forward(self):
|
||||
"""Run forward pass; called by both functions <optimize_parameters> and <test>."""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def optimize_parameters(self):
|
||||
"""Calculate losses, gradients, and update network weights; called in every training iteration"""
|
||||
pass
|
||||
|
||||
def setup(self, opt):
|
||||
"""Load and print networks; create schedulers
|
||||
|
||||
Parameters:
|
||||
opt (Option class) -- stores all the experiment flags; needs to be a subclass of BaseOptions
|
||||
"""
|
||||
if self.isTrain:
|
||||
self.schedulers = [networks.get_scheduler(optimizer, opt) for optimizer in self.optimizers]
|
||||
if not self.isTrain or opt.continue_train:
|
||||
load_suffix = 'iter_%d' % opt.load_iter if opt.load_iter > 0 else opt.epoch
|
||||
self.load_networks(load_suffix)
|
||||
self.print_networks(opt.verbose)
|
||||
|
||||
def eval(self):
|
||||
"""Make models eval mode during test time"""
|
||||
for name in self.model_names:
|
||||
if isinstance(name, str):
|
||||
net = getattr(self, 'net' + name)
|
||||
net.eval()
|
||||
|
||||
def test(self):
|
||||
"""Forward function used in test time.
|
||||
|
||||
This function wraps <forward> function in no_grad() so we don't save intermediate steps for backprop
|
||||
It also calls <compute_visuals> to produce additional visualization results
|
||||
"""
|
||||
with torch.no_grad():
|
||||
self.forward()
|
||||
self.compute_visuals()
|
||||
|
||||
def compute_visuals(self):
|
||||
"""Calculate additional output images for visdom and HTML visualization"""
|
||||
pass
|
||||
|
||||
def get_image_paths(self):
|
||||
""" Return image paths that are used to load current data"""
|
||||
return self.image_paths
|
||||
|
||||
def update_learning_rate(self):
|
||||
"""Update learning rates for all the networks; called at the end of every epoch"""
|
||||
old_lr = self.optimizers[0].param_groups[0]['lr']
|
||||
for scheduler in self.schedulers:
|
||||
if self.opt.lr_policy == 'plateau':
|
||||
scheduler.step(self.metric)
|
||||
else:
|
||||
scheduler.step()
|
||||
|
||||
lr = self.optimizers[0].param_groups[0]['lr']
|
||||
print('learning rate %.7f -> %.7f' % (old_lr, lr))
|
||||
|
||||
def get_current_visuals(self):
|
||||
"""Return visualization images. train.py will display these images with visdom, and save the images to a HTML"""
|
||||
visual_ret = OrderedDict()
|
||||
for name in self.visual_names:
|
||||
if isinstance(name, str):
|
||||
visual_ret[name] = getattr(self, name)
|
||||
return visual_ret
|
||||
|
||||
def get_current_losses(self):
|
||||
"""Return traning losses / errors. train.py will print out these errors on console, and save them to a file"""
|
||||
errors_ret = OrderedDict()
|
||||
for name in self.loss_names:
|
||||
if isinstance(name, str):
|
||||
errors_ret[name] = float(getattr(self, 'loss_' + name)) # float(...) works for both scalar tensor and float number
|
||||
return errors_ret
|
||||
|
||||
def save_networks(self, epoch):
|
||||
"""Save all the networks to the disk.
|
||||
|
||||
Parameters:
|
||||
epoch (int) -- current epoch; used in the file name '%s_net_%s.pth' % (epoch, name)
|
||||
"""
|
||||
for name in self.model_names:
|
||||
if isinstance(name, str):
|
||||
save_filename = '%s_net_%s.pth' % (epoch, name)
|
||||
save_path = os.path.join(self.save_dir, save_filename)
|
||||
net = getattr(self, 'net' + name)
|
||||
|
||||
if len(self.gpu_ids) > 0 and torch.cuda.is_available():
|
||||
torch.save(net.module.cpu().state_dict(), save_path)
|
||||
net.cuda(self.gpu_ids[0])
|
||||
else:
|
||||
torch.save(net.cpu().state_dict(), save_path)
|
||||
|
||||
def unload_network(self, name):
|
||||
"""Unload network and gc.
|
||||
"""
|
||||
if isinstance(name, str):
|
||||
net = getattr(self, 'net' + name)
|
||||
del net
|
||||
gc.collect()
|
||||
devices.torch_gc()
|
||||
return None
|
||||
|
||||
def __patch_instance_norm_state_dict(self, state_dict, module, keys, i=0):
|
||||
"""Fix InstanceNorm checkpoints incompatibility (prior to 0.4)"""
|
||||
key = keys[i]
|
||||
if i + 1 == len(keys): # at the end, pointing to a parameter/buffer
|
||||
if module.__class__.__name__.startswith('InstanceNorm') and \
|
||||
(key == 'running_mean' or key == 'running_var'):
|
||||
if getattr(module, key) is None:
|
||||
state_dict.pop('.'.join(keys))
|
||||
if module.__class__.__name__.startswith('InstanceNorm') and \
|
||||
(key == 'num_batches_tracked'):
|
||||
state_dict.pop('.'.join(keys))
|
||||
else:
|
||||
self.__patch_instance_norm_state_dict(state_dict, getattr(module, key), keys, i + 1)
|
||||
|
||||
def load_networks(self, epoch):
|
||||
"""Load all the networks from the disk.
|
||||
|
||||
Parameters:
|
||||
epoch (int) -- current epoch; used in the file name '%s_net_%s.pth' % (epoch, name)
|
||||
"""
|
||||
for name in self.model_names:
|
||||
if isinstance(name, str):
|
||||
load_filename = '%s_net_%s.pth' % (epoch, name)
|
||||
load_path = os.path.join(self.save_dir, load_filename)
|
||||
net = getattr(self, 'net' + name)
|
||||
if isinstance(net, torch.nn.DataParallel):
|
||||
net = net.module
|
||||
# print('Loading depth boost model from %s' % load_path)
|
||||
# if you are using PyTorch newer than 0.4 (e.g., built from
|
||||
# GitHub source), you can remove str() on self.device
|
||||
state_dict = torch.load(load_path, map_location=str(self.device))
|
||||
if hasattr(state_dict, '_metadata'):
|
||||
del state_dict._metadata
|
||||
|
||||
# patch InstanceNorm checkpoints prior to 0.4
|
||||
for key in list(state_dict.keys()): # need to copy keys here because we mutate in loop
|
||||
self.__patch_instance_norm_state_dict(state_dict, net, key.split('.'))
|
||||
net.load_state_dict(state_dict)
|
||||
|
||||
def print_networks(self, verbose):
|
||||
"""Print the total number of parameters in the network and (if verbose) network architecture
|
||||
|
||||
Parameters:
|
||||
verbose (bool) -- if verbose: print the network architecture
|
||||
"""
|
||||
print('---------- Networks initialized -------------')
|
||||
for name in self.model_names:
|
||||
if isinstance(name, str):
|
||||
net = getattr(self, 'net' + name)
|
||||
num_params = 0
|
||||
for param in net.parameters():
|
||||
num_params += param.numel()
|
||||
if verbose:
|
||||
print(net)
|
||||
print('[Network %s] Total number of parameters : %.3f M' % (name, num_params / 1e6))
|
||||
print('-----------------------------------------------')
|
||||
|
||||
def set_requires_grad(self, nets, requires_grad=False):
|
||||
"""Set requies_grad=Fasle for all the networks to avoid unnecessary computations
|
||||
Parameters:
|
||||
nets (network list) -- a list of networks
|
||||
requires_grad (bool) -- whether the networks require gradients or not
|
||||
"""
|
||||
if not isinstance(nets, list):
|
||||
nets = [nets]
|
||||
for net in nets:
|
||||
if net is not None:
|
||||
for param in net.parameters():
|
||||
param.requires_grad = requires_grad
|
||||
@@ -0,0 +1,58 @@
|
||||
import os
|
||||
import torch
|
||||
|
||||
class BaseModelHG():
|
||||
def name(self):
|
||||
return 'BaseModel'
|
||||
|
||||
def initialize(self, opt):
|
||||
self.opt = opt
|
||||
self.gpu_ids = opt.gpu_ids
|
||||
self.isTrain = opt.isTrain
|
||||
self.Tensor = torch.cuda.FloatTensor if self.gpu_ids else torch.Tensor
|
||||
self.save_dir = os.path.join(opt.checkpoints_dir, opt.name)
|
||||
|
||||
def set_input(self, input):
|
||||
self.input = input
|
||||
|
||||
def forward(self):
|
||||
pass
|
||||
|
||||
# used in test time, no backprop
|
||||
def test(self):
|
||||
pass
|
||||
|
||||
def get_image_paths(self):
|
||||
pass
|
||||
|
||||
def optimize_parameters(self):
|
||||
pass
|
||||
|
||||
def get_current_visuals(self):
|
||||
return self.input
|
||||
|
||||
def get_current_errors(self):
|
||||
return {}
|
||||
|
||||
def save(self, label):
|
||||
pass
|
||||
|
||||
# helper saving function that can be used by subclasses
|
||||
def save_network(self, network, network_label, epoch_label, gpu_ids):
|
||||
save_filename = '_%s_net_%s.pth' % (epoch_label, network_label)
|
||||
save_path = os.path.join(self.save_dir, save_filename)
|
||||
torch.save(network.cpu().state_dict(), save_path)
|
||||
if len(gpu_ids) and torch.cuda.is_available():
|
||||
network.cuda(gpu_ids[0])
|
||||
|
||||
# helper loading function that can be used by subclasses
|
||||
def load_network(self, network, network_label, epoch_label):
|
||||
save_filename = '%s_net_%s.pth' % (epoch_label, network_label)
|
||||
save_path = os.path.join(self.save_dir, save_filename)
|
||||
print(save_path)
|
||||
model = torch.load(save_path)
|
||||
return model
|
||||
# network.load_state_dict(torch.load(save_path))
|
||||
|
||||
def update_learning_rate(self):
|
||||
pass
|
||||
@@ -0,0 +1,623 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from torch.nn import init
|
||||
import functools
|
||||
from torch.optim import lr_scheduler
|
||||
|
||||
|
||||
###############################################################################
|
||||
# Helper Functions
|
||||
###############################################################################
|
||||
|
||||
|
||||
class Identity(nn.Module):
|
||||
def forward(self, x):
|
||||
return x
|
||||
|
||||
|
||||
def get_norm_layer(norm_type='instance'):
|
||||
"""Return a normalization layer
|
||||
|
||||
Parameters:
|
||||
norm_type (str) -- the name of the normalization layer: batch | instance | none
|
||||
|
||||
For BatchNorm, we use learnable affine parameters and track running statistics (mean/stddev).
|
||||
For InstanceNorm, we do not use learnable affine parameters. We do not track running statistics.
|
||||
"""
|
||||
if norm_type == 'batch':
|
||||
norm_layer = functools.partial(nn.BatchNorm2d, affine=True, track_running_stats=True)
|
||||
elif norm_type == 'instance':
|
||||
norm_layer = functools.partial(nn.InstanceNorm2d, affine=False, track_running_stats=False)
|
||||
elif norm_type == 'none':
|
||||
def norm_layer(x): return Identity()
|
||||
else:
|
||||
raise NotImplementedError('normalization layer [%s] is not found' % norm_type)
|
||||
return norm_layer
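
# Hypothetical usage sketch: 'instance' yields a factory for InstanceNorm2d layers
# without affine parameters or running statistics, matching the docstring above.
def _norm_layer_example():
    norm_layer = get_norm_layer('instance')
    return norm_layer(64)   # an nn.InstanceNorm2d(64, affine=False, track_running_stats=False) instance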
|
||||
|
||||
|
||||
def get_scheduler(optimizer, opt):
|
||||
"""Return a learning rate scheduler
|
||||
|
||||
Parameters:
|
||||
optimizer -- the optimizer of the network
|
||||
opt (option class) -- stores all the experiment flags; needs to be a subclass of BaseOptions.
|
||||
opt.lr_policy is the name of learning rate policy: linear | step | plateau | cosine
|
||||
|
||||
For 'linear', we keep the same learning rate for the first <opt.n_epochs> epochs
|
||||
and linearly decay the rate to zero over the next <opt.n_epochs_decay> epochs.
|
||||
For other schedulers (step, plateau, and cosine), we use the default PyTorch schedulers.
|
||||
See https://pytorch.org/docs/stable/optim.html for more details.
|
||||
"""
|
||||
if opt.lr_policy == 'linear':
|
||||
def lambda_rule(epoch):
|
||||
lr_l = 1.0 - max(0, epoch + opt.epoch_count - opt.n_epochs) / float(opt.n_epochs_decay + 1)
|
||||
return lr_l
|
||||
scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_rule)
|
||||
elif opt.lr_policy == 'step':
|
||||
scheduler = lr_scheduler.StepLR(optimizer, step_size=opt.lr_decay_iters, gamma=0.1)
|
||||
elif opt.lr_policy == 'plateau':
|
||||
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.2, threshold=0.01, patience=5)
|
||||
elif opt.lr_policy == 'cosine':
|
||||
scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=opt.n_epochs, eta_min=0)
|
||||
else:
|
||||
return NotImplementedError('learning rate policy [%s] is not implemented', opt.lr_policy)
|
||||
return scheduler
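
# Hypothetical usage sketch (option values are illustrative): with the 'linear'
# policy the learning rate stays constant for opt.n_epochs epochs and then decays
# linearly to zero over the next opt.n_epochs_decay epochs.
def _scheduler_example(net):
    import argparse
    opt = argparse.Namespace(lr_policy='linear', epoch_count=1, n_epochs=100, n_epochs_decay=100)
    optimizer = torch.optim.Adam(net.parameters(), lr=2e-4, betas=(0.5, 0.999))
    return optimizer, get_scheduler(optimizer, opt)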
|
||||
|
||||
|
||||
def init_weights(net, init_type='normal', init_gain=0.02):
|
||||
"""Initialize network weights.
|
||||
|
||||
Parameters:
|
||||
net (network) -- network to be initialized
|
||||
init_type (str) -- the name of an initialization method: normal | xavier | kaiming | orthogonal
|
||||
init_gain (float) -- scaling factor for normal, xavier and orthogonal.
|
||||
|
||||
We use 'normal' in the original pix2pix and CycleGAN paper. But xavier and kaiming might
|
||||
work better for some applications. Feel free to try yourself.
|
||||
"""
|
||||
def init_func(m): # define the initialization function
|
||||
classname = m.__class__.__name__
|
||||
if hasattr(m, 'weight') and (classname.find('Conv') != -1 or classname.find('Linear') != -1):
|
||||
if init_type == 'normal':
|
||||
init.normal_(m.weight.data, 0.0, init_gain)
|
||||
elif init_type == 'xavier':
|
||||
init.xavier_normal_(m.weight.data, gain=init_gain)
|
||||
elif init_type == 'kaiming':
|
||||
init.kaiming_normal_(m.weight.data, a=0, mode='fan_in')
|
||||
elif init_type == 'orthogonal':
|
||||
init.orthogonal_(m.weight.data, gain=init_gain)
|
||||
else:
|
||||
raise NotImplementedError('initialization method [%s] is not implemented' % init_type)
|
||||
if hasattr(m, 'bias') and m.bias is not None:
|
||||
init.constant_(m.bias.data, 0.0)
|
||||
elif classname.find('BatchNorm2d') != -1: # BatchNorm Layer's weight is not a matrix; only normal distribution applies.
|
||||
init.normal_(m.weight.data, 1.0, init_gain)
|
||||
init.constant_(m.bias.data, 0.0)
|
||||
|
||||
# print('initialize network with %s' % init_type)
|
||||
net.apply(init_func) # apply the initialization function <init_func>
|
||||
|
||||
|
||||
def init_net(net, init_type='normal', init_gain=0.02, gpu_ids=[]):
|
||||
"""Initialize a network: 1. register CPU/GPU device (with multi-GPU support); 2. initialize the network weights
|
||||
Parameters:
|
||||
net (network) -- the network to be initialized
|
||||
init_type (str) -- the name of an initialization method: normal | xavier | kaiming | orthogonal
|
||||
gain (float) -- scaling factor for normal, xavier and orthogonal.
|
||||
gpu_ids (int list) -- which GPUs the network runs on: e.g., 0,1,2
|
||||
|
||||
Return an initialized network.
|
||||
"""
|
||||
if len(gpu_ids) > 0:
|
||||
assert(torch.cuda.is_available())
|
||||
net.to(gpu_ids[0])
|
||||
net = torch.nn.DataParallel(net, gpu_ids) # multi-GPUs
|
||||
init_weights(net, init_type, init_gain=init_gain)
|
||||
return net
|
||||
|
||||
|
||||
def define_G(input_nc, output_nc, ngf, netG, norm='batch', use_dropout=False, init_type='normal', init_gain=0.02, gpu_ids=[]):
|
||||
"""Create a generator
|
||||
|
||||
Parameters:
|
||||
input_nc (int) -- the number of channels in input images
|
||||
output_nc (int) -- the number of channels in output images
|
||||
ngf (int) -- the number of filters in the last conv layer
|
||||
netG (str) -- the architecture's name: resnet_9blocks | resnet_6blocks | unet_256 | unet_128
|
||||
norm (str) -- the name of normalization layers used in the network: batch | instance | none
|
||||
use_dropout (bool) -- if use dropout layers.
|
||||
init_type (str) -- the name of our initialization method.
|
||||
init_gain (float) -- scaling factor for normal, xavier and orthogonal.
|
||||
gpu_ids (int list) -- which GPUs the network runs on: e.g., 0,1,2
|
||||
|
||||
Returns a generator
|
||||
|
||||
Our current implementation provides two types of generators:
|
||||
U-Net: [unet_128] (for 128x128 input images) and [unet_256] (for 256x256 input images)
|
||||
The original U-Net paper: https://arxiv.org/abs/1505.04597
|
||||
|
||||
Resnet-based generator: [resnet_6blocks] (with 6 Resnet blocks) and [resnet_9blocks] (with 9 Resnet blocks)
|
||||
Resnet-based generator consists of several Resnet blocks between a few downsampling/upsampling operations.
|
||||
We adapt Torch code from Justin Johnson's neural style transfer project (https://github.com/jcjohnson/fast-neural-style).
|
||||
|
||||
|
||||
The generator has been initialized by <init_net>. It uses RELU for non-linearity.
|
||||
"""
|
||||
net = None
|
||||
norm_layer = get_norm_layer(norm_type=norm)
|
||||
|
||||
if netG == 'resnet_9blocks':
|
||||
net = ResnetGenerator(input_nc, output_nc, ngf, norm_layer=norm_layer, use_dropout=use_dropout, n_blocks=9)
|
||||
elif netG == 'resnet_6blocks':
|
||||
net = ResnetGenerator(input_nc, output_nc, ngf, norm_layer=norm_layer, use_dropout=use_dropout, n_blocks=6)
|
||||
elif netG == 'resnet_12blocks':
|
||||
net = ResnetGenerator(input_nc, output_nc, ngf, norm_layer=norm_layer, use_dropout=use_dropout, n_blocks=12)
|
||||
elif netG == 'unet_128':
|
||||
net = UnetGenerator(input_nc, output_nc, 7, ngf, norm_layer=norm_layer, use_dropout=use_dropout)
|
||||
elif netG == 'unet_256':
|
||||
net = UnetGenerator(input_nc, output_nc, 8, ngf, norm_layer=norm_layer, use_dropout=use_dropout)
|
||||
elif netG == 'unet_672':
|
||||
net = UnetGenerator(input_nc, output_nc, 5, ngf, norm_layer=norm_layer, use_dropout=use_dropout)
|
||||
elif netG == 'unet_960':
|
||||
net = UnetGenerator(input_nc, output_nc, 6, ngf, norm_layer=norm_layer, use_dropout=use_dropout)
|
||||
elif netG == 'unet_1024':
|
||||
net = UnetGenerator(input_nc, output_nc, 10, ngf, norm_layer=norm_layer, use_dropout=use_dropout)
|
||||
else:
|
||||
raise NotImplementedError('Generator model name [%s] is not recognized' % netG)
|
||||
return init_net(net, init_type, init_gain, gpu_ids)
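
# Hypothetical usage sketch: build a small ResNet generator on CPU; Pix2Pix4DepthModel
# in this repository instead calls define_G(..., netG='unet_1024', norm='none').
def _generator_example():
    netG = define_G(input_nc=3, output_nc=3, ngf=64, netG='resnet_6blocks', norm='instance')
    return netG(torch.randn(1, 3, 256, 256))   # -> torch.Size([1, 3, 256, 256])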
|
||||
|
||||
|
||||
def define_D(input_nc, ndf, netD, n_layers_D=3, norm='batch', init_type='normal', init_gain=0.02, gpu_ids=[]):
|
||||
"""Create a discriminator
|
||||
|
||||
Parameters:
|
||||
input_nc (int) -- the number of channels in input images
|
||||
ndf (int) -- the number of filters in the first conv layer
|
||||
netD (str) -- the architecture's name: basic | n_layers | pixel
|
||||
n_layers_D (int) -- the number of conv layers in the discriminator; effective when netD=='n_layers'
|
||||
norm (str) -- the type of normalization layers used in the network.
|
||||
init_type (str) -- the name of the initialization method.
|
||||
init_gain (float) -- scaling factor for normal, xavier and orthogonal.
|
||||
gpu_ids (int list) -- which GPUs the network runs on: e.g., 0,1,2
|
||||
|
||||
Returns a discriminator
|
||||
|
||||
Our current implementation provides three types of discriminators:
|
||||
[basic]: 'PatchGAN' classifier described in the original pix2pix paper.
|
||||
It can classify whether 70×70 overlapping patches are real or fake.
|
||||
Such a patch-level discriminator architecture has fewer parameters
|
||||
than a full-image discriminator and can work on arbitrarily-sized images
|
||||
in a fully convolutional fashion.
|
||||
|
||||
[n_layers]: With this mode, you can specify the number of conv layers in the discriminator
|
||||
with the parameter <n_layers_D> (default=3 as used in [basic] (PatchGAN).)
|
||||
|
||||
[pixel]: 1x1 PixelGAN discriminator can classify whether a pixel is real or not.
|
||||
It encourages greater color diversity but has no effect on spatial statistics.
|
||||
|
||||
The discriminator has been initialized by <init_net>. It uses Leaky ReLU for non-linearity.
|
||||
"""
|
||||
net = None
|
||||
norm_layer = get_norm_layer(norm_type=norm)
|
||||
|
||||
if netD == 'basic': # default PatchGAN classifier
|
||||
net = NLayerDiscriminator(input_nc, ndf, n_layers=3, norm_layer=norm_layer)
|
||||
elif netD == 'n_layers': # more options
|
||||
net = NLayerDiscriminator(input_nc, ndf, n_layers_D, norm_layer=norm_layer)
|
||||
elif netD == 'pixel': # classify if each pixel is real or fake
|
||||
net = PixelDiscriminator(input_nc, ndf, norm_layer=norm_layer)
|
||||
else:
|
||||
raise NotImplementedError('Discriminator model name [%s] is not recognized' % netD)
|
||||
return init_net(net, init_type, init_gain, gpu_ids)
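
# Hypothetical usage sketch: a conditional discriminator sees the generator input
# and output concatenated along the channel axis (input_nc + output_nc channels),
# as Pix2Pix4DepthModel does with 2 + 1 channels.
def _discriminator_example():
    netD = define_D(input_nc=2 + 1, ndf=64, netD='basic', norm='instance')
    return netD(torch.randn(1, 3, 256, 256))   # one-channel PatchGAN prediction map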
|
||||
|
||||
|
||||
##############################################################################
|
||||
# Classes
|
||||
##############################################################################
|
||||
class GANLoss(nn.Module):
|
||||
"""Define different GAN objectives.
|
||||
|
||||
The GANLoss class abstracts away the need to create the target label tensor
|
||||
that has the same size as the input.
|
||||
"""
|
||||
|
||||
def __init__(self, gan_mode, target_real_label=1.0, target_fake_label=0.0):
|
||||
""" Initialize the GANLoss class.
|
||||
|
||||
Parameters:
|
||||
gan_mode (str) - - the type of GAN objective. It currently supports vanilla, lsgan, and wgangp.
|
||||
target_real_label (bool) - - label for a real image
|
||||
target_fake_label (bool) - - label of a fake image
|
||||
|
||||
Note: Do not use sigmoid as the last layer of Discriminator.
|
||||
LSGAN needs no sigmoid. vanilla GANs will handle it with BCEWithLogitsLoss.
|
||||
"""
|
||||
super(GANLoss, self).__init__()
|
||||
self.register_buffer('real_label', torch.tensor(target_real_label))
|
||||
self.register_buffer('fake_label', torch.tensor(target_fake_label))
|
||||
self.gan_mode = gan_mode
|
||||
if gan_mode == 'lsgan':
|
||||
self.loss = nn.MSELoss()
|
||||
elif gan_mode == 'vanilla':
|
||||
self.loss = nn.BCEWithLogitsLoss()
|
||||
elif gan_mode in ['wgangp']:
|
||||
self.loss = None
|
||||
else:
|
||||
raise NotImplementedError('gan mode %s not implemented' % gan_mode)
|
||||
|
||||
def get_target_tensor(self, prediction, target_is_real):
|
||||
"""Create label tensors with the same size as the input.
|
||||
|
||||
Parameters:
|
||||
prediction (tensor) - - typically the prediction from a discriminator
|
||||
target_is_real (bool) - - if the ground truth label is for real images or fake images
|
||||
|
||||
Returns:
|
||||
A label tensor filled with ground truth label, and with the size of the input
|
||||
"""
|
||||
|
||||
if target_is_real:
|
||||
target_tensor = self.real_label
|
||||
else:
|
||||
target_tensor = self.fake_label
|
||||
return target_tensor.expand_as(prediction)
|
||||
|
||||
def __call__(self, prediction, target_is_real):
|
||||
"""Calculate loss given Discriminator's output and grount truth labels.
|
||||
|
||||
Parameters:
|
||||
prediction (tensor) - - typically the prediction output from a discriminator
|
||||
target_is_real (bool) - - if the ground truth label is for real images or fake images
|
||||
|
||||
Returns:
|
||||
the calculated loss.
|
||||
"""
|
||||
if self.gan_mode in ['lsgan', 'vanilla']:
|
||||
target_tensor = self.get_target_tensor(prediction, target_is_real)
|
||||
loss = self.loss(prediction, target_tensor)
|
||||
elif self.gan_mode == 'wgangp':
|
||||
if target_is_real:
|
||||
loss = -prediction.mean()
|
||||
else:
|
||||
loss = prediction.mean()
|
||||
return loss
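
# Hypothetical usage sketch: the target tensor is expanded to match the discriminator
# output, so the same criterion works for any prediction shape.
def _gan_loss_example(pred_fake):
    criterion = GANLoss('lsgan')
    loss_g = criterion(pred_fake, True)              # generator wants fakes judged real
    loss_d_fake = criterion(pred_fake.detach(), False)
    return loss_g, loss_d_fake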
|
||||
|
||||
|
||||
def cal_gradient_penalty(netD, real_data, fake_data, device, type='mixed', constant=1.0, lambda_gp=10.0):
|
||||
"""Calculate the gradient penalty loss, used in WGAN-GP paper https://arxiv.org/abs/1704.00028
|
||||
|
||||
Arguments:
|
||||
netD (network) -- discriminator network
|
||||
real_data (tensor array) -- real images
|
||||
fake_data (tensor array) -- generated images from the generator
|
||||
device (str) -- GPU / CPU: from torch.device('cuda:{}'.format(self.gpu_ids[0])) if self.gpu_ids else torch.device('cpu')
|
||||
type (str) -- if we mix real and fake data or not [real | fake | mixed].
|
||||
constant (float) -- the constant used in formula ( ||gradient||_2 - constant)^2
|
||||
lambda_gp (float) -- weight for this loss
|
||||
|
||||
Returns the gradient penalty loss
|
||||
"""
|
||||
if lambda_gp > 0.0:
|
||||
if type == 'real': # either use real images, fake images, or a linear interpolation of two.
|
||||
interpolatesv = real_data
|
||||
elif type == 'fake':
|
||||
interpolatesv = fake_data
|
||||
elif type == 'mixed':
|
||||
alpha = torch.rand(real_data.shape[0], 1, device=device)
|
||||
alpha = alpha.expand(real_data.shape[0], real_data.nelement() // real_data.shape[0]).contiguous().view(*real_data.shape)
|
||||
interpolatesv = alpha * real_data + ((1 - alpha) * fake_data)
|
||||
else:
|
||||
raise NotImplementedError('{} not implemented'.format(type))
|
||||
interpolatesv.requires_grad_(True)
|
||||
disc_interpolates = netD(interpolatesv)
|
||||
gradients = torch.autograd.grad(outputs=disc_interpolates, inputs=interpolatesv,
|
||||
grad_outputs=torch.ones(disc_interpolates.size()).to(device),
|
||||
create_graph=True, retain_graph=True, only_inputs=True)
|
||||
gradients = gradients[0].view(real_data.size(0), -1) # flatten the data
|
||||
gradient_penalty = (((gradients + 1e-16).norm(2, dim=1) - constant) ** 2).mean() * lambda_gp # added eps
|
||||
return gradient_penalty, gradients
|
||||
else:
|
||||
return 0.0, None
|
||||
|
||||
|
||||
class ResnetGenerator(nn.Module):
|
||||
"""Resnet-based generator that consists of Resnet blocks between a few downsampling/upsampling operations.
|
||||
|
||||
We adapt Torch code and idea from Justin Johnson's neural style transfer project(https://github.com/jcjohnson/fast-neural-style)
|
||||
"""
|
||||
|
||||
def __init__(self, input_nc, output_nc, ngf=64, norm_layer=nn.BatchNorm2d, use_dropout=False, n_blocks=6, padding_type='reflect'):
|
||||
"""Construct a Resnet-based generator
|
||||
|
||||
Parameters:
|
||||
input_nc (int) -- the number of channels in input images
|
||||
output_nc (int) -- the number of channels in output images
|
||||
ngf (int) -- the number of filters in the last conv layer
|
||||
norm_layer -- normalization layer
|
||||
use_dropout (bool) -- if use dropout layers
|
||||
n_blocks (int) -- the number of ResNet blocks
|
||||
padding_type (str) -- the name of padding layer in conv layers: reflect | replicate | zero
|
||||
"""
|
||||
assert(n_blocks >= 0)
|
||||
super(ResnetGenerator, self).__init__()
|
||||
if type(norm_layer) == functools.partial:
|
||||
use_bias = norm_layer.func == nn.InstanceNorm2d
|
||||
else:
|
||||
use_bias = norm_layer == nn.InstanceNorm2d
|
||||
|
||||
model = [nn.ReflectionPad2d(3),
|
||||
nn.Conv2d(input_nc, ngf, kernel_size=7, padding=0, bias=use_bias),
|
||||
norm_layer(ngf),
|
||||
nn.ReLU(True)]
|
||||
|
||||
n_downsampling = 2
|
||||
for i in range(n_downsampling): # add downsampling layers
|
||||
mult = 2 ** i
|
||||
model += [nn.Conv2d(ngf * mult, ngf * mult * 2, kernel_size=3, stride=2, padding=1, bias=use_bias),
|
||||
norm_layer(ngf * mult * 2),
|
||||
nn.ReLU(True)]
|
||||
|
||||
mult = 2 ** n_downsampling
|
||||
for i in range(n_blocks): # add ResNet blocks
|
||||
|
||||
model += [ResnetBlock(ngf * mult, padding_type=padding_type, norm_layer=norm_layer, use_dropout=use_dropout, use_bias=use_bias)]
|
||||
|
||||
for i in range(n_downsampling): # add upsampling layers
|
||||
mult = 2 ** (n_downsampling - i)
|
||||
model += [nn.ConvTranspose2d(ngf * mult, int(ngf * mult / 2),
|
||||
kernel_size=3, stride=2,
|
||||
padding=1, output_padding=1,
|
||||
bias=use_bias),
|
||||
norm_layer(int(ngf * mult / 2)),
|
||||
nn.ReLU(True)]
|
||||
model += [nn.ReflectionPad2d(3)]
|
||||
model += [nn.Conv2d(ngf, output_nc, kernel_size=7, padding=0)]
|
||||
model += [nn.Tanh()]
|
||||
|
||||
self.model = nn.Sequential(*model)
|
||||
|
||||
def forward(self, input):
|
||||
"""Standard forward"""
|
||||
return self.model(input)
|
||||
|
||||
|
||||
class ResnetBlock(nn.Module):
|
||||
"""Define a Resnet block"""
|
||||
|
||||
def __init__(self, dim, padding_type, norm_layer, use_dropout, use_bias):
|
||||
"""Initialize the Resnet block
|
||||
|
||||
A resnet block is a conv block with skip connections
|
||||
We construct a conv block with build_conv_block function,
|
||||
and implement skip connections in <forward> function.
|
||||
Original Resnet paper: https://arxiv.org/pdf/1512.03385.pdf
|
||||
"""
|
||||
super(ResnetBlock, self).__init__()
|
||||
self.conv_block = self.build_conv_block(dim, padding_type, norm_layer, use_dropout, use_bias)
|
||||
|
||||
def build_conv_block(self, dim, padding_type, norm_layer, use_dropout, use_bias):
|
||||
"""Construct a convolutional block.
|
||||
|
||||
Parameters:
|
||||
dim (int) -- the number of channels in the conv layer.
|
||||
padding_type (str) -- the name of padding layer: reflect | replicate | zero
|
||||
norm_layer -- normalization layer
|
||||
use_dropout (bool) -- if use dropout layers.
|
||||
use_bias (bool) -- if the conv layer uses bias or not
|
||||
|
||||
Returns a conv block (with a conv layer, a normalization layer, and a non-linearity layer (ReLU))
|
||||
"""
|
||||
conv_block = []
|
||||
p = 0
|
||||
if padding_type == 'reflect':
|
||||
conv_block += [nn.ReflectionPad2d(1)]
|
||||
elif padding_type == 'replicate':
|
||||
conv_block += [nn.ReplicationPad2d(1)]
|
||||
elif padding_type == 'zero':
|
||||
p = 1
|
||||
else:
|
||||
raise NotImplementedError('padding [%s] is not implemented' % padding_type)
|
||||
|
||||
conv_block += [nn.Conv2d(dim, dim, kernel_size=3, padding=p, bias=use_bias), norm_layer(dim), nn.ReLU(True)]
|
||||
if use_dropout:
|
||||
conv_block += [nn.Dropout(0.5)]
|
||||
|
||||
p = 0
|
||||
if padding_type == 'reflect':
|
||||
conv_block += [nn.ReflectionPad2d(1)]
|
||||
elif padding_type == 'replicate':
|
||||
conv_block += [nn.ReplicationPad2d(1)]
|
||||
elif padding_type == 'zero':
|
||||
p = 1
|
||||
else:
|
||||
raise NotImplementedError('padding [%s] is not implemented' % padding_type)
|
||||
conv_block += [nn.Conv2d(dim, dim, kernel_size=3, padding=p, bias=use_bias), norm_layer(dim)]
|
||||
|
||||
return nn.Sequential(*conv_block)
|
||||
|
||||
def forward(self, x):
|
||||
"""Forward function (with skip connections)"""
|
||||
out = x + self.conv_block(x) # add skip connections
|
||||
return out
|
||||
|
||||
|
||||
class UnetGenerator(nn.Module):
|
||||
"""Create a Unet-based generator"""
|
||||
|
||||
def __init__(self, input_nc, output_nc, num_downs, ngf=64, norm_layer=nn.BatchNorm2d, use_dropout=False):
|
||||
"""Construct a Unet generator
|
||||
Parameters:
|
||||
input_nc (int) -- the number of channels in input images
|
||||
output_nc (int) -- the number of channels in output images
|
||||
num_downs (int) -- the number of downsamplings in UNet. For example, if |num_downs| == 7,
an image of size 128x128 will become of size 1x1 at the bottleneck
|
||||
ngf (int) -- the number of filters in the last conv layer
|
||||
norm_layer -- normalization layer
|
||||
|
||||
We construct the U-Net from the innermost layer to the outermost layer.
|
||||
It is a recursive process.
|
||||
"""
|
||||
super(UnetGenerator, self).__init__()
|
||||
# construct unet structure
|
||||
unet_block = UnetSkipConnectionBlock(ngf * 8, ngf * 8, input_nc=None, submodule=None, norm_layer=norm_layer, innermost=True) # add the innermost layer
|
||||
for i in range(num_downs - 5): # add intermediate layers with ngf * 8 filters
|
||||
unet_block = UnetSkipConnectionBlock(ngf * 8, ngf * 8, input_nc=None, submodule=unet_block, norm_layer=norm_layer, use_dropout=use_dropout)
|
||||
# gradually reduce the number of filters from ngf * 8 to ngf
|
||||
unet_block = UnetSkipConnectionBlock(ngf * 4, ngf * 8, input_nc=None, submodule=unet_block, norm_layer=norm_layer)
|
||||
unet_block = UnetSkipConnectionBlock(ngf * 2, ngf * 4, input_nc=None, submodule=unet_block, norm_layer=norm_layer)
|
||||
unet_block = UnetSkipConnectionBlock(ngf, ngf * 2, input_nc=None, submodule=unet_block, norm_layer=norm_layer)
|
||||
self.model = UnetSkipConnectionBlock(output_nc, ngf, input_nc=input_nc, submodule=unet_block, outermost=True, norm_layer=norm_layer) # add the outermost layer
|
||||
|
||||
def forward(self, input):
|
||||
"""Standard forward"""
|
||||
return self.model(input)
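
# Hypothetical shape sketch: with num_downs=8 a 256x256 input is reduced to 1x1 at
# the bottleneck and restored to 256x256 by the nested skip-connection blocks.
def _unet_example():
    netG = UnetGenerator(input_nc=3, output_nc=3, num_downs=8)
    return netG(torch.randn(1, 3, 256, 256))   # -> torch.Size([1, 3, 256, 256])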
|
||||
|
||||
|
||||
class UnetSkipConnectionBlock(nn.Module):
|
||||
"""Defines the Unet submodule with skip connection.
|
||||
X -------------------identity----------------------
|
||||
|-- downsampling -- |submodule| -- upsampling --|
|
||||
"""
|
||||
|
||||
def __init__(self, outer_nc, inner_nc, input_nc=None,
|
||||
submodule=None, outermost=False, innermost=False, norm_layer=nn.BatchNorm2d, use_dropout=False):
|
||||
"""Construct a Unet submodule with skip connections.
|
||||
|
||||
Parameters:
|
||||
outer_nc (int) -- the number of filters in the outer conv layer
|
||||
inner_nc (int) -- the number of filters in the inner conv layer
|
||||
input_nc (int) -- the number of channels in input images/features
|
||||
submodule (UnetSkipConnectionBlock) -- previously defined submodules
|
||||
outermost (bool) -- if this module is the outermost module
|
||||
innermost (bool) -- if this module is the innermost module
|
||||
norm_layer -- normalization layer
|
||||
use_dropout (bool) -- if use dropout layers.
|
||||
"""
|
||||
super(UnetSkipConnectionBlock, self).__init__()
|
||||
self.outermost = outermost
|
||||
if type(norm_layer) == functools.partial:
|
||||
use_bias = norm_layer.func == nn.InstanceNorm2d
|
||||
else:
|
||||
use_bias = norm_layer == nn.InstanceNorm2d
|
||||
if input_nc is None:
|
||||
input_nc = outer_nc
|
||||
downconv = nn.Conv2d(input_nc, inner_nc, kernel_size=4,
|
||||
stride=2, padding=1, bias=use_bias)
|
||||
downrelu = nn.LeakyReLU(0.2, True)
|
||||
downnorm = norm_layer(inner_nc)
|
||||
uprelu = nn.ReLU(True)
|
||||
upnorm = norm_layer(outer_nc)
|
||||
|
||||
if outermost:
|
||||
upconv = nn.ConvTranspose2d(inner_nc * 2, outer_nc,
|
||||
kernel_size=4, stride=2,
|
||||
padding=1)
|
||||
down = [downconv]
|
||||
up = [uprelu, upconv, nn.Tanh()]
|
||||
model = down + [submodule] + up
|
||||
elif innermost:
|
||||
upconv = nn.ConvTranspose2d(inner_nc, outer_nc,
|
||||
kernel_size=4, stride=2,
|
||||
padding=1, bias=use_bias)
|
||||
down = [downrelu, downconv]
|
||||
up = [uprelu, upconv, upnorm]
|
||||
model = down + up
|
||||
else:
|
||||
upconv = nn.ConvTranspose2d(inner_nc * 2, outer_nc,
|
||||
kernel_size=4, stride=2,
|
||||
padding=1, bias=use_bias)
|
||||
down = [downrelu, downconv, downnorm]
|
||||
up = [uprelu, upconv, upnorm]
|
||||
|
||||
if use_dropout:
|
||||
model = down + [submodule] + up + [nn.Dropout(0.5)]
|
||||
else:
|
||||
model = down + [submodule] + up
|
||||
|
||||
self.model = nn.Sequential(*model)
|
||||
|
||||
def forward(self, x):
|
||||
if self.outermost:
|
||||
return self.model(x)
|
||||
else: # add skip connections
|
||||
return torch.cat([x, self.model(x)], 1)
|
||||
|
||||
|
||||
class NLayerDiscriminator(nn.Module):
|
||||
"""Defines a PatchGAN discriminator"""
|
||||
|
||||
def __init__(self, input_nc, ndf=64, n_layers=3, norm_layer=nn.BatchNorm2d):
|
||||
"""Construct a PatchGAN discriminator
|
||||
|
||||
Parameters:
|
||||
input_nc (int) -- the number of channels in input images
|
||||
ndf (int) -- the number of filters in the last conv layer
|
||||
n_layers (int) -- the number of conv layers in the discriminator
|
||||
norm_layer -- normalization layer
|
||||
"""
|
||||
super(NLayerDiscriminator, self).__init__()
|
||||
if type(norm_layer) == functools.partial: # no need to use bias as BatchNorm2d has affine parameters
|
||||
use_bias = norm_layer.func == nn.InstanceNorm2d
|
||||
else:
|
||||
use_bias = norm_layer == nn.InstanceNorm2d
|
||||
|
||||
kw = 4
|
||||
padw = 1
|
||||
sequence = [nn.Conv2d(input_nc, ndf, kernel_size=kw, stride=2, padding=padw), nn.LeakyReLU(0.2, True)]
|
||||
nf_mult = 1
|
||||
nf_mult_prev = 1
|
||||
for n in range(1, n_layers): # gradually increase the number of filters
|
||||
nf_mult_prev = nf_mult
|
||||
nf_mult = min(2 ** n, 8)
|
||||
sequence += [
|
||||
nn.Conv2d(ndf * nf_mult_prev, ndf * nf_mult, kernel_size=kw, stride=2, padding=padw, bias=use_bias),
|
||||
norm_layer(ndf * nf_mult),
|
||||
nn.LeakyReLU(0.2, True)
|
||||
]
|
||||
|
||||
nf_mult_prev = nf_mult
|
||||
nf_mult = min(2 ** n_layers, 8)
|
||||
sequence += [
|
||||
nn.Conv2d(ndf * nf_mult_prev, ndf * nf_mult, kernel_size=kw, stride=1, padding=padw, bias=use_bias),
|
||||
norm_layer(ndf * nf_mult),
|
||||
nn.LeakyReLU(0.2, True)
|
||||
]
|
||||
|
||||
sequence += [nn.Conv2d(ndf * nf_mult, 1, kernel_size=kw, stride=1, padding=padw)] # output 1 channel prediction map
|
||||
self.model = nn.Sequential(*sequence)
|
||||
|
||||
def forward(self, input):
|
||||
"""Standard forward."""
|
||||
return self.model(input)
|
||||
|
||||
|
||||
class PixelDiscriminator(nn.Module):
|
||||
"""Defines a 1x1 PatchGAN discriminator (pixelGAN)"""
|
||||
|
||||
def __init__(self, input_nc, ndf=64, norm_layer=nn.BatchNorm2d):
|
||||
"""Construct a 1x1 PatchGAN discriminator
|
||||
|
||||
Parameters:
|
||||
input_nc (int) -- the number of channels in input images
|
||||
ndf (int) -- the number of filters in the last conv layer
|
||||
norm_layer -- normalization layer
|
||||
"""
|
||||
super(PixelDiscriminator, self).__init__()
|
||||
if type(norm_layer) == functools.partial: # no need to use bias as BatchNorm2d has affine parameters
|
||||
use_bias = norm_layer.func == nn.InstanceNorm2d
|
||||
else:
|
||||
use_bias = norm_layer == nn.InstanceNorm2d
|
||||
|
||||
self.net = [
|
||||
nn.Conv2d(input_nc, ndf, kernel_size=1, stride=1, padding=0),
|
||||
nn.LeakyReLU(0.2, True),
|
||||
nn.Conv2d(ndf, ndf * 2, kernel_size=1, stride=1, padding=0, bias=use_bias),
|
||||
norm_layer(ndf * 2),
|
||||
nn.LeakyReLU(0.2, True),
|
||||
nn.Conv2d(ndf * 2, 1, kernel_size=1, stride=1, padding=0, bias=use_bias)]
|
||||
|
||||
self.net = nn.Sequential(*self.net)
|
||||
|
||||
def forward(self, input):
|
||||
"""Standard forward."""
|
||||
return self.net(input)
|
||||
@@ -0,0 +1,155 @@
|
||||
import torch
|
||||
from .base_model import BaseModel
|
||||
from . import networks
|
||||
|
||||
|
||||
class Pix2Pix4DepthModel(BaseModel):
|
||||
""" This class implements the pix2pix model, for learning a mapping from input images to output images given paired data.
|
||||
|
||||
The model training requires '--dataset_mode aligned' dataset.
|
||||
By default, it uses a '--netG unet256' U-Net generator,
|
||||
a '--netD basic' discriminator (PatchGAN),
|
||||
and a '--gan_mode' vanilla GAN loss (the cross-entropy objective used in the original GAN paper).
|
||||
|
||||
pix2pix paper: https://arxiv.org/pdf/1611.07004.pdf
|
||||
"""
|
||||
@staticmethod
|
||||
def modify_commandline_options(parser, is_train=True):
|
||||
"""Add new dataset-specific options, and rewrite default values for existing options.
|
||||
|
||||
Parameters:
|
||||
parser -- original option parser
|
||||
is_train (bool) -- whether training phase or test phase. You can use this flag to add training-specific or test-specific options.
|
||||
|
||||
Returns:
|
||||
the modified parser.
|
||||
|
||||
For pix2pix, we do not use image buffer
|
||||
The training objective is: GAN Loss + lambda_L1 * ||G(A)-B||_1
|
||||
By default, we use vanilla GAN loss, UNet with batchnorm, and aligned datasets.
|
||||
"""
|
||||
# changing the default values to match the pix2pix paper (https://phillipi.github.io/pix2pix/)
|
||||
parser.set_defaults(input_nc=2,output_nc=1,norm='none', netG='unet_1024', dataset_mode='depthmerge')
|
||||
if is_train:
|
||||
parser.set_defaults(pool_size=0, gan_mode='vanilla',)
|
||||
parser.add_argument('--lambda_L1', type=float, default=1000, help='weight for L1 loss')
|
||||
return parser
|
||||
|
||||
def __init__(self, opt):
|
||||
"""Initialize the pix2pix class.
|
||||
|
||||
Parameters:
|
||||
opt (Option class)-- stores all the experiment flags; needs to be a subclass of BaseOptions
|
||||
"""
|
||||
BaseModel.__init__(self, opt)
|
||||
# specify the training losses you want to print out. The training/test scripts will call <BaseModel.get_current_losses>
|
||||
|
||||
self.loss_names = ['G_GAN', 'G_L1', 'D_real', 'D_fake']
|
||||
# self.loss_names = ['G_L1']
|
||||
|
||||
# specify the images you want to save/display. The training/test scripts will call <BaseModel.get_current_visuals>
|
||||
if self.isTrain:
|
||||
self.visual_names = ['outer','inner', 'fake_B', 'real_B']
|
||||
else:
|
||||
self.visual_names = ['fake_B']
|
||||
|
||||
# specify the models you want to save to the disk. The training/test scripts will call <BaseModel.save_networks> and <BaseModel.load_networks>
|
||||
if self.isTrain:
|
||||
self.model_names = ['G','D']
|
||||
else: # during test time, only load G
|
||||
self.model_names = ['G']
|
||||
|
||||
# define networks (both generator and discriminator)
|
||||
self.netG = networks.define_G(opt.input_nc, opt.output_nc, 64, 'unet_1024', 'none',
|
||||
False, 'normal', 0.02, self.gpu_ids)
|
||||
|
||||
if self.isTrain: # define a discriminator; conditional GANs need to take both input and output images; Therefore, #channels for D is input_nc + output_nc
|
||||
self.netD = networks.define_D(opt.input_nc + opt.output_nc, opt.ndf, opt.netD,
|
||||
opt.n_layers_D, opt.norm, opt.init_type, opt.init_gain, self.gpu_ids)
|
||||
|
||||
if self.isTrain:
|
||||
# define loss functions
|
||||
self.criterionGAN = networks.GANLoss(opt.gan_mode).to(self.device)
|
||||
self.criterionL1 = torch.nn.L1Loss()
|
||||
# initialize optimizers; schedulers will be automatically created by function <BaseModel.setup>.
|
||||
self.optimizer_G = torch.optim.Adam(self.netG.parameters(), lr=1e-4, betas=(opt.beta1, 0.999))
|
||||
self.optimizer_D = torch.optim.Adam(self.netD.parameters(), lr=2e-06, betas=(opt.beta1, 0.999))
|
||||
self.optimizers.append(self.optimizer_G)
|
||||
self.optimizers.append(self.optimizer_D)
|
||||
|
||||
def set_input_train(self, input):
|
||||
self.outer = input['data_outer'].to(self.device)
|
||||
self.outer = torch.nn.functional.interpolate(self.outer,(1024,1024),mode='bilinear',align_corners=False)
|
||||
|
||||
self.inner = input['data_inner'].to(self.device)
|
||||
self.inner = torch.nn.functional.interpolate(self.inner,(1024,1024),mode='bilinear',align_corners=False)
|
||||
|
||||
self.image_paths = input['image_path']
|
||||
|
||||
if self.isTrain:
|
||||
self.gtfake = input['data_gtfake'].to(self.device)
|
||||
self.gtfake = torch.nn.functional.interpolate(self.gtfake, (1024, 1024), mode='bilinear', align_corners=False)
|
||||
self.real_B = self.gtfake
|
||||
|
||||
self.real_A = torch.cat((self.outer, self.inner), 1)
|
||||
|
||||
def set_input(self, outer, inner):
|
||||
inner = torch.from_numpy(inner).unsqueeze(0).unsqueeze(0)
|
||||
outer = torch.from_numpy(outer).unsqueeze(0).unsqueeze(0)
|
||||
|
||||
inner = (inner - torch.min(inner))/(torch.max(inner)-torch.min(inner))
|
||||
outer = (outer - torch.min(outer))/(torch.max(outer)-torch.min(outer))
|
||||
|
||||
inner = self.normalize(inner)
|
||||
outer = self.normalize(outer)
|
||||
|
||||
self.real_A = torch.cat((outer, inner), 1).to(self.device)
|
||||
|
||||
|
||||
def normalize(self, input):
|
||||
input = input * 2
|
||||
input = input - 1
|
||||
return input
|
||||
|
||||
def forward(self):
|
||||
"""Run forward pass; called by both functions <optimize_parameters> and <test>."""
|
||||
self.fake_B = self.netG(self.real_A) # G(A)
|
||||
|
||||
def backward_D(self):
|
||||
"""Calculate GAN loss for the discriminator"""
|
||||
# Fake; stop backprop to the generator by detaching fake_B
|
||||
fake_AB = torch.cat((self.real_A, self.fake_B), 1) # we use conditional GANs; we need to feed both input and output to the discriminator
|
||||
pred_fake = self.netD(fake_AB.detach())
|
||||
self.loss_D_fake = self.criterionGAN(pred_fake, False)
|
||||
# Real
|
||||
real_AB = torch.cat((self.real_A, self.real_B), 1)
|
||||
pred_real = self.netD(real_AB)
|
||||
self.loss_D_real = self.criterionGAN(pred_real, True)
|
||||
# combine loss and calculate gradients
|
||||
self.loss_D = (self.loss_D_fake + self.loss_D_real) * 0.5
|
||||
self.loss_D.backward()
|
||||
|
||||
def backward_G(self):
|
||||
"""Calculate GAN and L1 loss for the generator"""
|
||||
# First, G(A) should fake the discriminator
|
||||
fake_AB = torch.cat((self.real_A, self.fake_B), 1)
|
||||
pred_fake = self.netD(fake_AB)
|
||||
self.loss_G_GAN = self.criterionGAN(pred_fake, True)
|
||||
# Second, G(A) = B
|
||||
self.loss_G_L1 = self.criterionL1(self.fake_B, self.real_B) * self.opt.lambda_L1
|
||||
# combine loss and calculate gradients
|
||||
self.loss_G = self.loss_G_L1 + self.loss_G_GAN
|
||||
self.loss_G.backward()
|
||||
|
||||
def optimize_parameters(self):
|
||||
self.forward() # compute fake images: G(A)
|
||||
# update D
|
||||
self.set_requires_grad(self.netD, True) # enable backprop for D
|
||||
self.optimizer_D.zero_grad() # set D's gradients to zero
|
||||
self.backward_D() # calculate gradients for D
|
||||
self.optimizer_D.step() # update D's weights
|
||||
# update G
|
||||
self.set_requires_grad(self.netD, False) # D requires no gradients when optimizing G
|
||||
self.optimizer_G.zero_grad() # set G's gradients to zero
|
||||
self.backward_G() # calculate gradients for G
|
||||
self.optimizer_G.step() # update G's weights
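
# Hypothetical single-iteration sketch (argument names illustrative): 'model' is a
# Pix2Pix4DepthModel built from training options, 'batch' is a dict providing
# 'data_outer', 'data_inner', 'data_gtfake' tensors and 'image_path'.
def _train_step_example(model, batch):
    model.set_input_train(batch)
    model.optimize_parameters()   # one D update followed by one G update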
|
||||
@@ -0,0 +1 @@
|
||||
"""This package options includes option modules: training options, test options, and basic options (used in both training and test)."""
|
||||
@@ -0,0 +1,156 @@
|
||||
import argparse
|
||||
import os
|
||||
from ...pix2pix.util import util
|
||||
# import torch
|
||||
from ...pix2pix import models
|
||||
# import pix2pix.data
|
||||
import numpy as np
|
||||
|
||||
class BaseOptions():
|
||||
"""This class defines options used during both training and test time.
|
||||
|
||||
It also implements several helper functions such as parsing, printing, and saving the options.
|
||||
It also gathers additional options defined in <modify_commandline_options> functions in both dataset class and model class.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
"""Reset the class; indicates the class hasn't been initailized"""
|
||||
self.initialized = False
|
||||
|
||||
def initialize(self, parser):
|
||||
"""Define the common options that are used in both training and test."""
|
||||
# basic parameters
|
||||
parser.add_argument('--dataroot', help='path to images (should have subfolders trainA, trainB, valA, valB, etc)')
|
||||
parser.add_argument('--name', type=str, default='void', help='mahdi_unet_new, scaled_unet')
|
||||
parser.add_argument('--gpu_ids', type=str, default='0', help='gpu ids: e.g. 0, or 0,1,2, or 0,2; use -1 for CPU')
|
||||
parser.add_argument('--checkpoints_dir', type=str, default='./pix2pix/checkpoints', help='models are saved here')
|
||||
# model parameters
|
||||
parser.add_argument('--model', type=str, default='cycle_gan', help='chooses which model to use. [cycle_gan | pix2pix | test | colorization]')
|
||||
parser.add_argument('--input_nc', type=int, default=2, help='# of input image channels: 3 for RGB and 1 for grayscale')
|
||||
parser.add_argument('--output_nc', type=int, default=1, help='# of output image channels: 3 for RGB and 1 for grayscale')
|
||||
parser.add_argument('--ngf', type=int, default=64, help='# of gen filters in the last conv layer')
|
||||
parser.add_argument('--ndf', type=int, default=64, help='# of discrim filters in the first conv layer')
|
||||
parser.add_argument('--netD', type=str, default='basic', help='specify discriminator architecture [basic | n_layers | pixel]. The basic model is a 70x70 PatchGAN. n_layers allows you to specify the layers in the discriminator')
|
||||
parser.add_argument('--netG', type=str, default='resnet_9blocks', help='specify generator architecture [resnet_9blocks | resnet_6blocks | unet_256 | unet_128]')
|
||||
parser.add_argument('--n_layers_D', type=int, default=3, help='only used if netD==n_layers')
|
||||
parser.add_argument('--norm', type=str, default='instance', help='instance normalization or batch normalization [instance | batch | none]')
|
||||
parser.add_argument('--init_type', type=str, default='normal', help='network initialization [normal | xavier | kaiming | orthogonal]')
|
||||
parser.add_argument('--init_gain', type=float, default=0.02, help='scaling factor for normal, xavier and orthogonal.')
|
||||
parser.add_argument('--no_dropout', action='store_true', help='no dropout for the generator')
|
||||
# dataset parameters
|
||||
parser.add_argument('--dataset_mode', type=str, default='unaligned', help='chooses how datasets are loaded. [unaligned | aligned | single | colorization]')
|
||||
parser.add_argument('--direction', type=str, default='AtoB', help='AtoB or BtoA')
|
||||
parser.add_argument('--serial_batches', action='store_true', help='if true, takes images in order to make batches, otherwise takes them randomly')
|
||||
parser.add_argument('--num_threads', default=4, type=int, help='# threads for loading data')
|
||||
parser.add_argument('--batch_size', type=int, default=1, help='input batch size')
|
||||
parser.add_argument('--load_size', type=int, default=672, help='scale images to this size')
|
||||
parser.add_argument('--crop_size', type=int, default=672, help='then crop to this size')
|
||||
parser.add_argument('--max_dataset_size', type=int, default=10000, help='Maximum number of samples allowed per dataset. If the dataset directory contains more than max_dataset_size, only a subset is loaded.')
|
||||
parser.add_argument('--preprocess', type=str, default='resize_and_crop', help='scaling and cropping of images at load time [resize_and_crop | crop | scale_width | scale_width_and_crop | none]')
|
||||
parser.add_argument('--no_flip', action='store_true', help='if specified, do not flip the images for data augmentation')
|
||||
parser.add_argument('--display_winsize', type=int, default=256, help='display window size for both visdom and HTML')
|
||||
# additional parameters
|
||||
parser.add_argument('--epoch', type=str, default='latest', help='which epoch to load? set to latest to use latest cached model')
|
||||
parser.add_argument('--load_iter', type=int, default='0', help='which iteration to load? if load_iter > 0, the code will load models by iter_[load_iter]; otherwise, the code will load models by [epoch]')
|
||||
parser.add_argument('--verbose', action='store_true', help='if specified, print more debugging information')
|
||||
parser.add_argument('--suffix', default='', type=str, help='customized suffix: opt.name = opt.name + suffix: e.g., {model}_{netG}_size{load_size}')
|
||||
|
||||
parser.add_argument('--data_dir', type=str, required=False,
|
||||
help='input files directory; images can be .png, .jpg, or .tiff')
|
||||
parser.add_argument('--output_dir', type=str, required=False,
|
||||
help='result dir. result depth will be png. videos are MJPG as avi')
|
||||
parser.add_argument('--savecrops', type=int, required=False)
|
||||
parser.add_argument('--savewholeest', type=int, required=False)
|
||||
parser.add_argument('--output_resolution', type=int, required=False,
|
||||
help='0 for no restriction, 1 for resize to input size')
|
||||
parser.add_argument('--net_receptive_field_size', type=int, required=False)
|
||||
parser.add_argument('--pix2pixsize', type=int, required=False)
|
||||
parser.add_argument('--generatevideo', type=int, required=False)
|
||||
parser.add_argument('--depthNet', type=int, required=False, help='0: midas, 1: structuredRL')
|
||||
parser.add_argument('--R0', action='store_true')
|
||||
parser.add_argument('--R20', action='store_true')
|
||||
parser.add_argument('--Final', action='store_true')
|
||||
parser.add_argument('--colorize_results', action='store_true')
|
||||
parser.add_argument('--max_res', type=float, default=np.inf)
|
||||
|
||||
self.initialized = True
|
||||
return parser
|
||||
|
||||
def gather_options(self):
|
||||
"""Initialize our parser with basic options(only once).
|
||||
Add additional model-specific and dataset-specific options.
|
||||
These options are defined in the <modify_commandline_options> function
|
||||
in model and dataset classes.
|
||||
"""
|
||||
if not self.initialized: # check if it has been initialized
|
||||
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||
parser = self.initialize(parser)
|
||||
|
||||
# get the basic options
|
||||
opt, _ = parser.parse_known_args()
|
||||
|
||||
# modify model-related parser options
|
||||
model_name = opt.model
|
||||
model_option_setter = models.get_option_setter(model_name)
|
||||
parser = model_option_setter(parser, self.isTrain)
|
||||
opt, _ = parser.parse_known_args() # parse again with new defaults
|
||||
|
||||
# modify dataset-related parser options
|
||||
# dataset_name = opt.dataset_mode
|
||||
# dataset_option_setter = pix2pix.data.get_option_setter(dataset_name)
|
||||
# parser = dataset_option_setter(parser, self.isTrain)
|
||||
|
||||
# save and return the parser
|
||||
self.parser = parser
|
||||
#return parser.parse_args() #EVIL
|
||||
return opt
|
||||
|
||||
def print_options(self, opt):
|
||||
"""Print and save options
|
||||
|
||||
It will print both current options and default values(if different).
|
||||
It will save options into a text file / [checkpoints_dir] / opt.txt
|
||||
"""
|
||||
message = ''
|
||||
message += '----------------- Options ---------------\n'
|
||||
for k, v in sorted(vars(opt).items()):
|
||||
comment = ''
|
||||
default = self.parser.get_default(k)
|
||||
if v != default:
|
||||
comment = '\t[default: %s]' % str(default)
|
||||
message += '{:>25}: {:<30}{}\n'.format(str(k), str(v), comment)
|
||||
message += '----------------- End -------------------'
|
||||
print(message)
|
||||
|
||||
# save to the disk
|
||||
expr_dir = os.path.join(opt.checkpoints_dir, opt.name)
|
||||
util.mkdirs(expr_dir)
|
||||
file_name = os.path.join(expr_dir, '{}_opt.txt'.format(opt.phase))
|
||||
with open(file_name, 'wt') as opt_file:
|
||||
opt_file.write(message)
|
||||
opt_file.write('\n')
|
||||
|
||||
def parse(self):
|
||||
"""Parse our options, create checkpoints directory suffix, and set up gpu device."""
|
||||
opt = self.gather_options()
|
||||
opt.isTrain = self.isTrain # train or test
|
||||
|
||||
# process opt.suffix
|
||||
if opt.suffix:
|
||||
suffix = ('_' + opt.suffix.format(**vars(opt))) if opt.suffix != '' else ''
|
||||
opt.name = opt.name + suffix
|
||||
|
||||
#self.print_options(opt)
|
||||
|
||||
# set gpu ids
|
||||
str_ids = opt.gpu_ids.split(',')
|
||||
opt.gpu_ids = []
|
||||
for str_id in str_ids:
|
||||
id = int(str_id)
|
||||
if id >= 0:
|
||||
opt.gpu_ids.append(id)
|
||||
#if len(opt.gpu_ids) > 0:
|
||||
# torch.cuda.set_device(opt.gpu_ids[0])
|
||||
|
||||
self.opt = opt
|
||||
return self.opt
|
||||
@@ -0,0 +1,22 @@
|
||||
from .base_options import BaseOptions
|
||||
|
||||
|
||||
class TestOptions(BaseOptions):
|
||||
"""This class includes test options.
|
||||
|
||||
It also includes shared options defined in BaseOptions.
|
||||
"""
|
||||
|
||||
def initialize(self, parser):
|
||||
parser = BaseOptions.initialize(self, parser) # define shared options
|
||||
parser.add_argument('--aspect_ratio', type=float, default=1.0, help='aspect ratio of result images')
|
||||
parser.add_argument('--phase', type=str, default='test', help='train, val, test, etc')
|
||||
# Dropout and Batchnorm have different behavior during training and test.
|
||||
parser.add_argument('--eval', action='store_true', help='use eval mode during test time.')
|
||||
parser.add_argument('--num_test', type=int, default=50, help='how many test images to run')
|
||||
# rewrite default values
|
||||
parser.set_defaults(model='pix2pix4depth')
|
||||
# To avoid cropping, the load_size should be the same as crop_size
|
||||
parser.set_defaults(load_size=parser.get_default('crop_size'))
|
||||
self.isTrain = False
|
||||
return parser
|
||||
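A minimal usage sketch (not part of this commit) of how the option classes defined above are typically consumed; it assumes TestOptions is importable in the current package layout:

# Hypothetical sketch: consuming the options defined above from a test script.
opt = TestOptions().parse()                  # gathers base + test options, sets isTrain=False
print(opt.model)                             # 'pix2pix4depth' from the set_defaults call above
print(opt.load_size == opt.crop_size)        # True: load_size defaults to crop_size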
@@ -0,0 +1 @@
|
||||
"""This package includes a miscellaneous collection of useful helper functions."""
|
||||
@@ -0,0 +1,110 @@
|
||||
from __future__ import print_function
|
||||
import os
|
||||
import tarfile
|
||||
import requests
|
||||
from warnings import warn
|
||||
from zipfile import ZipFile
|
||||
from bs4 import BeautifulSoup
|
||||
from os.path import abspath, isdir, join, basename
|
||||
|
||||
|
||||
class GetData(object):
|
||||
"""A Python script for downloading CycleGAN or pix2pix datasets.
|
||||
|
||||
Parameters:
|
||||
technique (str) -- One of: 'cyclegan' or 'pix2pix'.
|
||||
verbose (bool) -- If True, print additional information.
|
||||
|
||||
Examples:
|
||||
>>> from util.get_data import GetData
|
||||
>>> gd = GetData(technique='cyclegan')
|
||||
>>> new_data_path = gd.get(save_path='./datasets') # options will be displayed.
|
||||
|
||||
Alternatively, you can use the bash scripts: 'scripts/download_pix2pix_model.sh'
|
||||
and 'scripts/download_cyclegan_model.sh'.
|
||||
"""
|
||||
|
||||
def __init__(self, technique='cyclegan', verbose=True):
|
||||
url_dict = {
|
||||
'pix2pix': 'http://efrosgans.eecs.berkeley.edu/pix2pix/datasets/',
|
||||
'cyclegan': 'https://people.eecs.berkeley.edu/~taesung_park/CycleGAN/datasets'
|
||||
}
|
||||
self.url = url_dict.get(technique.lower())
|
||||
self._verbose = verbose
|
||||
|
||||
def _print(self, text):
|
||||
if self._verbose:
|
||||
print(text)
|
||||
|
||||
@staticmethod
|
||||
def _get_options(r):
|
||||
soup = BeautifulSoup(r.text, 'lxml')
|
||||
options = [h.text for h in soup.find_all('a', href=True)
|
||||
if h.text.endswith(('.zip', 'tar.gz'))]
|
||||
return options
|
||||
|
||||
def _present_options(self):
|
||||
r = requests.get(self.url)
|
||||
options = self._get_options(r)
|
||||
print('Options:\n')
|
||||
for i, o in enumerate(options):
|
||||
print("{0}: {1}".format(i, o))
|
||||
choice = input("\nPlease enter the number of the "
|
||||
"dataset above you wish to download:")
|
||||
return options[int(choice)]
|
||||
|
||||
def _download_data(self, dataset_url, save_path):
|
||||
if not isdir(save_path):
|
||||
os.makedirs(save_path)
|
||||
|
||||
base = basename(dataset_url)
|
||||
temp_save_path = join(save_path, base)
|
||||
|
||||
with open(temp_save_path, "wb") as f:
|
||||
r = requests.get(dataset_url)
|
||||
f.write(r.content)
|
||||
|
||||
if base.endswith('.tar.gz'):
|
||||
obj = tarfile.open(temp_save_path)
|
||||
elif base.endswith('.zip'):
|
||||
obj = ZipFile(temp_save_path, 'r')
|
||||
else:
|
||||
raise ValueError("Unknown File Type: {0}.".format(base))
|
||||
|
||||
self._print("Unpacking Data...")
|
||||
obj.extractall(save_path)
|
||||
obj.close()
|
||||
os.remove(temp_save_path)
|
||||
|
||||
def get(self, save_path, dataset=None):
|
||||
"""
|
||||
|
||||
Download a dataset.
|
||||
|
||||
Parameters:
|
||||
save_path (str) -- A directory to save the data to.
|
||||
dataset (str) -- (optional). A specific dataset to download.
|
||||
Note: this must include the file extension.
|
||||
If None, options will be presented for you
|
||||
to choose from.
|
||||
|
||||
Returns:
|
||||
save_path_full (str) -- the absolute path to the downloaded data.
|
||||
|
||||
"""
|
||||
if dataset is None:
|
||||
selected_dataset = self._present_options()
|
||||
else:
|
||||
selected_dataset = dataset
|
||||
|
||||
save_path_full = join(save_path, selected_dataset.split('.')[0])
|
||||
|
||||
if isdir(save_path_full):
|
||||
warn("\n'{0}' already exists. Voiding Download.".format(
|
||||
save_path_full))
|
||||
else:
|
||||
self._print('Downloading Data...')
|
||||
url = "{0}/{1}".format(self.url, selected_dataset)
|
||||
self._download_data(url, save_path=save_path)
|
||||
|
||||
return abspath(save_path_full)
|
||||
@@ -0,0 +1,47 @@
|
||||
import numpy as np
|
||||
|
||||
class GuidedFilter():
|
||||
def __init__(self, source, reference, r=64, eps=0.05**2):
|
||||
self.source = source
|
||||
self.reference = reference
|
||||
self.r = r
|
||||
self.eps = eps
|
||||
|
||||
self.smooth = self.guidedfilter(self.source,self.reference,self.r,self.eps)
|
||||
|
||||
def boxfilter(self,img, r):
|
||||
(rows, cols) = img.shape
|
||||
imDst = np.zeros_like(img)
|
||||
|
||||
imCum = np.cumsum(img, 0)
|
||||
imDst[0 : r+1, :] = imCum[r : 2*r+1, :]
|
||||
imDst[r+1 : rows-r, :] = imCum[2*r+1 : rows, :] - imCum[0 : rows-2*r-1, :]
|
||||
imDst[rows-r: rows, :] = np.tile(imCum[rows-1, :], [r, 1]) - imCum[rows-2*r-1 : rows-r-1, :]
|
||||
|
||||
imCum = np.cumsum(imDst, 1)
|
||||
imDst[:, 0 : r+1] = imCum[:, r : 2*r+1]
|
||||
imDst[:, r+1 : cols-r] = imCum[:, 2*r+1 : cols] - imCum[:, 0 : cols-2*r-1]
|
||||
imDst[:, cols-r: cols] = np.tile(imCum[:, cols-1], [r, 1]).T - imCum[:, cols-2*r-1 : cols-r-1]
|
||||
|
||||
return imDst
|
||||
|
||||
def guidedfilter(self,I, p, r, eps):
|
||||
(rows, cols) = I.shape
|
||||
N = self.boxfilter(np.ones([rows, cols]), r)
|
||||
|
||||
meanI = self.boxfilter(I, r) / N
|
||||
meanP = self.boxfilter(p, r) / N
|
||||
meanIp = self.boxfilter(I * p, r) / N
|
||||
covIp = meanIp - meanI * meanP
|
||||
|
||||
meanII = self.boxfilter(I * I, r) / N
|
||||
varI = meanII - meanI * meanI
|
||||
|
||||
a = covIp / (varI + eps)
|
||||
b = meanP - a * meanI
|
||||
|
||||
meanA = self.boxfilter(a, r) / N
|
||||
meanB = self.boxfilter(b, r) / N
|
||||
|
||||
q = meanA * I + meanB
|
||||
return q
|
||||
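A short usage sketch (not part of this commit) of the GuidedFilter class above; the input array here is a synthetic stand-in for an estimated depth map:

# Hypothetical sketch: smooth a single-channel map with the guided filter,
# using the map itself as the guidance image.
import numpy as np

depth = np.random.rand(256, 256).astype(np.float64)   # stand-in for an estimated depth map
gf = GuidedFilter(source=depth, reference=depth, r=64, eps=0.05 ** 2)
smoothed = gf.smooth                                   # the result is computed in __init__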
@@ -0,0 +1,86 @@
|
||||
import dominate
|
||||
from dominate.tags import meta, h3, table, tr, td, p, a, img, br
|
||||
import os
|
||||
|
||||
|
||||
class HTML:
|
||||
"""This HTML class allows us to save images and write texts into a single HTML file.
|
||||
|
||||
It consists of functions such as <add_header> (add a text header to the HTML file),
|
||||
<add_images> (add a row of images to the HTML file), and <save> (save the HTML to the disk).
|
||||
It is based on Python library 'dominate', a Python library for creating and manipulating HTML documents using a DOM API.
|
||||
"""
|
||||
|
||||
def __init__(self, web_dir, title, refresh=0):
|
||||
"""Initialize the HTML classes
|
||||
|
||||
Parameters:
|
||||
web_dir (str) -- a directory that stores the webpage. HTML file will be created at <web_dir>/index.html; images will be saved at <web_dir>/images/
|
||||
title (str) -- the webpage name
|
||||
refresh (int) -- how often the website refreshes itself; if 0, no refreshing
|
||||
"""
|
||||
self.title = title
|
||||
self.web_dir = web_dir
|
||||
self.img_dir = os.path.join(self.web_dir, 'images')
|
||||
if not os.path.exists(self.web_dir):
|
||||
os.makedirs(self.web_dir)
|
||||
if not os.path.exists(self.img_dir):
|
||||
os.makedirs(self.img_dir)
|
||||
|
||||
self.doc = dominate.document(title=title)
|
||||
if refresh > 0:
|
||||
with self.doc.head:
|
||||
meta(http_equiv="refresh", content=str(refresh))
|
||||
|
||||
def get_image_dir(self):
|
||||
"""Return the directory that stores images"""
|
||||
return self.img_dir
|
||||
|
||||
def add_header(self, text):
|
||||
"""Insert a header to the HTML file
|
||||
|
||||
Parameters:
|
||||
text (str) -- the header text
|
||||
"""
|
||||
with self.doc:
|
||||
h3(text)
|
||||
|
||||
def add_images(self, ims, txts, links, width=400):
|
||||
"""add images to the HTML file
|
||||
|
||||
Parameters:
|
||||
ims (str list) -- a list of image paths
|
||||
txts (str list) -- a list of image names shown on the website
|
||||
links (str list) -- a list of hyperref links; when you click an image, it will redirect you to a new page
|
||||
"""
|
||||
self.t = table(border=1, style="table-layout: fixed;") # Insert a table
|
||||
self.doc.add(self.t)
|
||||
with self.t:
|
||||
with tr():
|
||||
for im, txt, link in zip(ims, txts, links):
|
||||
with td(style="word-wrap: break-word;", halign="center", valign="top"):
|
||||
with p():
|
||||
with a(href=os.path.join('images', link)):
|
||||
img(style="width:%dpx" % width, src=os.path.join('images', im))
|
||||
br()
|
||||
p(txt)
|
||||
|
||||
def save(self):
|
||||
"""save the current content to the HMTL file"""
|
||||
html_file = '%s/index.html' % self.web_dir
|
||||
f = open(html_file, 'wt')
|
||||
f.write(self.doc.render())
|
||||
f.close()
|
||||
|
||||
|
||||
if __name__ == '__main__': # we show an example usage here.
|
||||
html = HTML('web/', 'test_html')
|
||||
html.add_header('hello world')
|
||||
|
||||
ims, txts, links = [], [], []
|
||||
for n in range(4):
|
||||
ims.append('image_%d.png' % n)
|
||||
txts.append('text_%d' % n)
|
||||
links.append('image_%d.png' % n)
|
||||
html.add_images(ims, txts, links)
|
||||
html.save()
|
||||
@@ -0,0 +1,54 @@
|
||||
import random
|
||||
import torch
|
||||
|
||||
|
||||
class ImagePool():
|
||||
"""This class implements an image buffer that stores previously generated images.
|
||||
|
||||
This buffer enables us to update discriminators using a history of generated images
|
||||
rather than the ones produced by the latest generators.
|
||||
"""
|
||||
|
||||
def __init__(self, pool_size):
|
||||
"""Initialize the ImagePool class
|
||||
|
||||
Parameters:
|
||||
pool_size (int) -- the size of image buffer, if pool_size=0, no buffer will be created
|
||||
"""
|
||||
self.pool_size = pool_size
|
||||
if self.pool_size > 0: # create an empty pool
|
||||
self.num_imgs = 0
|
||||
self.images = []
|
||||
|
||||
def query(self, images):
|
||||
"""Return an image from the pool.
|
||||
|
||||
Parameters:
|
||||
images: the latest generated images from the generator
|
||||
|
||||
Returns images from the buffer.
|
||||
|
||||
With 50% probability, the buffer will return the current input images.
|
||||
With 50% probability, the buffer will return images previously stored in the buffer,
|
||||
and insert the current images into the buffer.
|
||||
"""
|
||||
if self.pool_size == 0: # if the buffer size is 0, do nothing
|
||||
return images
|
||||
return_images = []
|
||||
for image in images:
|
||||
image = torch.unsqueeze(image.data, 0)
|
||||
if self.num_imgs < self.pool_size: # if the buffer is not full; keep inserting current images to the buffer
|
||||
self.num_imgs = self.num_imgs + 1
|
||||
self.images.append(image)
|
||||
return_images.append(image)
|
||||
else:
|
||||
p = random.uniform(0, 1)
|
||||
if p > 0.5: # by 50% chance, the buffer will return a previously stored image, and insert the current image into the buffer
|
||||
random_id = random.randint(0, self.pool_size - 1) # randint is inclusive
|
||||
tmp = self.images[random_id].clone()
|
||||
self.images[random_id] = image
|
||||
return_images.append(tmp)
|
||||
else: # by another 50% chance, the buffer will return the current image
|
||||
return_images.append(image)
|
||||
return_images = torch.cat(return_images, 0) # collect all the images and return
|
||||
return return_images
|
||||
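A minimal sketch (not part of this commit) of how the image pool above is used to mix freshly generated fakes with older ones before they are shown to the discriminator; the tensor is a stand-in for detached generator output:

# Hypothetical sketch: querying the pool with a batch of generated images.
import torch

pool = ImagePool(pool_size=50)
fake_batch = torch.randn(4, 1, 64, 64)      # stand-in for detached generator output
mixed = pool.query(fake_batch)              # early calls fill the buffer, later calls mix 50/50
assert mixed.shape == fake_batch.shape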
@@ -0,0 +1,105 @@
|
||||
"""This module contains simple helper functions """
|
||||
from __future__ import print_function
|
||||
import torch
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
import os
|
||||
|
||||
|
||||
def tensor2im(input_image, imtype=np.uint16):
|
||||
""""Converts a Tensor array into a numpy image array.
|
||||
|
||||
Parameters:
|
||||
input_image (tensor) -- the input image tensor array
|
||||
imtype (type) -- the desired type of the converted numpy array
|
||||
"""
|
||||
if not isinstance(input_image, np.ndarray):
|
||||
if isinstance(input_image, torch.Tensor): # get the data from a variable
|
||||
image_tensor = input_image.data
|
||||
else:
|
||||
return input_image
|
||||
image_numpy = torch.squeeze(image_tensor).cpu().numpy() # convert it into a numpy array
|
||||
image_numpy = (image_numpy + 1) / 2.0 * (2**16-1)  # map [-1, 1] to the 16-bit range [0, 65535]
|
||||
else: # if it is a numpy array, do nothing
|
||||
image_numpy = input_image
|
||||
return image_numpy.astype(imtype)
|
||||
|
||||
|
||||
def diagnose_network(net, name='network'):
|
||||
"""Calculate and print the mean of average absolute(gradients)
|
||||
|
||||
Parameters:
|
||||
net (torch network) -- Torch network
|
||||
name (str) -- the name of the network
|
||||
"""
|
||||
mean = 0.0
|
||||
count = 0
|
||||
for param in net.parameters():
|
||||
if param.grad is not None:
|
||||
mean += torch.mean(torch.abs(param.grad.data))
|
||||
count += 1
|
||||
if count > 0:
|
||||
mean = mean / count
|
||||
print(name)
|
||||
print(mean)
|
||||
|
||||
|
||||
def save_image(image_numpy, image_path, aspect_ratio=1.0):
|
||||
"""Save a numpy image to the disk
|
||||
|
||||
Parameters:
|
||||
image_numpy (numpy array) -- input numpy array
|
||||
image_path (str) -- the path of the image
|
||||
"""
|
||||
image_pil = Image.fromarray(image_numpy)
|
||||
|
||||
image_pil = image_pil.convert('I;16')
|
||||
|
||||
# image_pil = Image.fromarray(image_numpy)
|
||||
# h, w, _ = image_numpy.shape
|
||||
#
|
||||
# if aspect_ratio > 1.0:
|
||||
# image_pil = image_pil.resize((h, int(w * aspect_ratio)), Image.BICUBIC)
|
||||
# if aspect_ratio < 1.0:
|
||||
# image_pil = image_pil.resize((int(h / aspect_ratio), w), Image.BICUBIC)
|
||||
|
||||
image_pil.save(image_path)
|
||||
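A round-trip sketch (not part of this commit; the output filename is only an example) showing how tensor2im and save_image above combine to write a 16-bit PNG:

# Hypothetical sketch: write a network output in [-1, 1] as a 16-bit grayscale PNG.
import torch

fake_depth = torch.rand(1, 1, 64, 64) * 2 - 1        # stand-in for generator output
depth_u16 = tensor2im(fake_depth)                     # uint16 array scaled to [0, 65535]
save_image(depth_u16, 'depth_example.png')            # saved via PIL mode 'I;16'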
|
||||
|
||||
def print_numpy(x, val=True, shp=False):
|
||||
"""Print the mean, min, max, median, std, and size of a numpy array
|
||||
|
||||
Parameters:
|
||||
val (bool) -- if print the values of the numpy array
|
||||
shp (bool) -- if print the shape of the numpy array
|
||||
"""
|
||||
x = x.astype(np.float64)
|
||||
if shp:
|
||||
print('shape,', x.shape)
|
||||
if val:
|
||||
x = x.flatten()
|
||||
print('mean = %3.3f, min = %3.3f, max = %3.3f, median = %3.3f, std=%3.3f' % (
|
||||
np.mean(x), np.min(x), np.max(x), np.median(x), np.std(x)))
|
||||
|
||||
|
||||
def mkdirs(paths):
|
||||
"""create empty directories if they don't exist
|
||||
|
||||
Parameters:
|
||||
paths (str list) -- a list of directory paths
|
||||
"""
|
||||
if isinstance(paths, list) and not isinstance(paths, str):
|
||||
for path in paths:
|
||||
mkdir(path)
|
||||
else:
|
||||
mkdir(paths)
|
||||
|
||||
|
||||
def mkdir(path):
|
||||
"""create a single empty directory if it didn't exist
|
||||
|
||||
Parameters:
|
||||
path (str) -- a single directory path
|
||||
"""
|
||||
if not os.path.exists(path):
|
||||
os.makedirs(path)
|
||||
@@ -0,0 +1,166 @@
|
||||
import numpy as np
|
||||
import os
|
||||
import sys
|
||||
import ntpath
|
||||
import time
|
||||
from . import util, html
|
||||
from subprocess import Popen, PIPE
|
||||
import torch
|
||||
|
||||
|
||||
if sys.version_info[0] == 2:
|
||||
VisdomExceptionBase = Exception
|
||||
else:
|
||||
VisdomExceptionBase = ConnectionError
|
||||
|
||||
|
||||
def save_images(webpage, visuals, image_path, aspect_ratio=1.0, width=256):
|
||||
"""Save images to the disk.
|
||||
|
||||
Parameters:
|
||||
webpage (the HTML class) -- the HTML webpage class that stores these images (see html.py for more details)
|
||||
visuals (OrderedDict) -- an ordered dictionary that stores (name, images (either tensor or numpy) ) pairs
|
||||
image_path (str) -- the string is used to create image paths
|
||||
aspect_ratio (float) -- the aspect ratio of saved images
|
||||
width (int) -- the images will be resized to width x width
|
||||
|
||||
This function will save images stored in 'visuals' to the HTML file specified by 'webpage'.
|
||||
"""
|
||||
image_dir = webpage.get_image_dir()
|
||||
short_path = ntpath.basename(image_path[0])
|
||||
name = os.path.splitext(short_path)[0]
|
||||
|
||||
webpage.add_header(name)
|
||||
ims, txts, links = [], [], []
|
||||
|
||||
for label, im_data in visuals.items():
|
||||
im = util.tensor2im(im_data)
|
||||
image_name = '%s_%s.png' % (name, label)
|
||||
save_path = os.path.join(image_dir, image_name)
|
||||
util.save_image(im, save_path, aspect_ratio=aspect_ratio)
|
||||
ims.append(image_name)
|
||||
txts.append(label)
|
||||
links.append(image_name)
|
||||
webpage.add_images(ims, txts, links, width=width)
|
||||
|
||||
|
||||
class Visualizer():
|
||||
"""This class includes several functions that can display/save images and print/save logging information.
|
||||
|
||||
It uses a Python library 'visdom' for display, and a Python library 'dominate' (wrapped in 'HTML') for creating HTML files with images.
|
||||
"""
|
||||
|
||||
def __init__(self, opt):
|
||||
"""Initialize the Visualizer class
|
||||
|
||||
Parameters:
|
||||
opt -- stores all the experiment flags; needs to be a subclass of BaseOptions
|
||||
Step 1: Cache the training/test options
|
||||
Step 2: connect to a visdom server
|
||||
Step 3: create an HTML object for saving HTML files
|
||||
Step 4: create a logging file to store training losses
|
||||
"""
|
||||
self.opt = opt # cache the option
|
||||
self.display_id = opt.display_id
|
||||
self.use_html = opt.isTrain and not opt.no_html
|
||||
self.win_size = opt.display_winsize
|
||||
self.name = opt.name
|
||||
self.port = opt.display_port
|
||||
self.saved = False
|
||||
|
||||
if self.use_html: # create an HTML object at <checkpoints_dir>/web/; images will be saved under <checkpoints_dir>/web/images/
|
||||
self.web_dir = os.path.join(opt.checkpoints_dir, opt.name, 'web')
|
||||
self.img_dir = os.path.join(self.web_dir, 'images')
|
||||
print('create web directory %s...' % self.web_dir)
|
||||
util.mkdirs([self.web_dir, self.img_dir])
|
||||
# create a logging file to store training losses
|
||||
self.log_name = os.path.join(opt.checkpoints_dir, opt.name, 'loss_log.txt')
|
||||
with open(self.log_name, "a") as log_file:
|
||||
now = time.strftime("%c")
|
||||
log_file.write('================ Training Loss (%s) ================\n' % now)
|
||||
|
||||
def reset(self):
|
||||
"""Reset the self.saved status"""
|
||||
self.saved = False
|
||||
|
||||
def create_visdom_connections(self):
|
||||
"""If the program could not connect to Visdom server, this function will start a new server at port < self.port > """
|
||||
cmd = sys.executable + ' -m visdom.server -p %d &>/dev/null &' % self.port
|
||||
print('\n\nCould not connect to Visdom server. \n Trying to start a server....')
|
||||
print('Command: %s' % cmd)
|
||||
Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
|
||||
|
||||
def display_current_results(self, visuals, epoch, save_result):
|
||||
"""Display current results on visdom; save current results to an HTML file.
|
||||
|
||||
Parameters:
|
||||
visuals (OrderedDict) - - dictionary of images to display or save
|
||||
epoch (int) - - the current epoch
|
||||
save_result (bool) - - if save the current results to an HTML file
|
||||
"""
|
||||
if self.use_html and (save_result or not self.saved): # save images to an HTML file if they haven't been saved.
|
||||
self.saved = True
|
||||
# save images to the disk
|
||||
for label, image in visuals.items():
|
||||
image_numpy = util.tensor2im(image)
|
||||
img_path = os.path.join(self.img_dir, 'epoch%.3d_%s.png' % (epoch, label))
|
||||
util.save_image(image_numpy, img_path)
|
||||
|
||||
# update website
|
||||
webpage = html.HTML(self.web_dir, 'Experiment name = %s' % self.name, refresh=1)
|
||||
for n in range(epoch, 0, -1):
|
||||
webpage.add_header('epoch [%d]' % n)
|
||||
ims, txts, links = [], [], []
|
||||
|
||||
for label, image_numpy in visuals.items():
|
||||
# image_numpy = util.tensor2im(image)
|
||||
img_path = 'epoch%.3d_%s.png' % (n, label)
|
||||
ims.append(img_path)
|
||||
txts.append(label)
|
||||
links.append(img_path)
|
||||
webpage.add_images(ims, txts, links, width=self.win_size)
|
||||
webpage.save()
|
||||
|
||||
# def plot_current_losses(self, epoch, counter_ratio, losses):
|
||||
# """display the current losses on visdom display: dictionary of error labels and values
|
||||
#
|
||||
# Parameters:
|
||||
# epoch (int) -- current epoch
|
||||
# counter_ratio (float) -- progress (percentage) in the current epoch, between 0 to 1
|
||||
# losses (OrderedDict) -- training losses stored in the format of (name, float) pairs
|
||||
# """
|
||||
# if not hasattr(self, 'plot_data'):
|
||||
# self.plot_data = {'X': [], 'Y': [], 'legend': list(losses.keys())}
|
||||
# self.plot_data['X'].append(epoch + counter_ratio)
|
||||
# self.plot_data['Y'].append([losses[k] for k in self.plot_data['legend']])
|
||||
# try:
|
||||
# self.vis.line(
|
||||
# X=np.stack([np.array(self.plot_data['X'])] * len(self.plot_data['legend']), 1),
|
||||
# Y=np.array(self.plot_data['Y']),
|
||||
# opts={
|
||||
# 'title': self.name + ' loss over time',
|
||||
# 'legend': self.plot_data['legend'],
|
||||
# 'xlabel': 'epoch',
|
||||
# 'ylabel': 'loss'},
|
||||
# win=self.display_id)
|
||||
# except VisdomExceptionBase:
|
||||
# self.create_visdom_connections()
|
||||
|
||||
# losses: same format as |losses| of plot_current_losses
|
||||
def print_current_losses(self, epoch, iters, losses, t_comp, t_data):
|
||||
"""print current losses on console; also save the losses to the disk
|
||||
|
||||
Parameters:
|
||||
epoch (int) -- current epoch
|
||||
iters (int) -- current training iteration during this epoch (reset to 0 at the end of every epoch)
|
||||
losses (OrderedDict) -- training losses stored in the format of (name, float) pairs
|
||||
t_comp (float) -- computational time per data point (normalized by batch_size)
|
||||
t_data (float) -- data loading time per data point (normalized by batch_size)
|
||||
"""
|
||||
message = '(epoch: %d, iters: %d, time: %.3f, data: %.3f) ' % (epoch, iters, t_comp, t_data)
|
||||
for k, v in losses.items():
|
||||
message += '%s: %.3f ' % (k, v)
|
||||
|
||||
print(message) # print the message
|
||||
with open(self.log_name, "a") as log_file:
|
||||
log_file.write('%s\n' % message) # save the message
|
||||
21
extensions-builtin/forge_legacy_preprocessors/annotator/lineart/LICENSE
Executable file
@@ -0,0 +1,21 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2022 Caroline Chan
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
133
extensions-builtin/forge_legacy_preprocessors/annotator/lineart/__init__.py
Executable file
@@ -0,0 +1,133 @@
|
||||
import os
|
||||
import cv2
|
||||
import torch
|
||||
import numpy as np
|
||||
|
||||
import torch.nn as nn
|
||||
from einops import rearrange
|
||||
from modules import devices
|
||||
from annotator.annotator_path import models_path
|
||||
|
||||
|
||||
norm_layer = nn.InstanceNorm2d
|
||||
|
||||
|
||||
class ResidualBlock(nn.Module):
|
||||
def __init__(self, in_features):
|
||||
super(ResidualBlock, self).__init__()
|
||||
|
||||
conv_block = [ nn.ReflectionPad2d(1),
|
||||
nn.Conv2d(in_features, in_features, 3),
|
||||
norm_layer(in_features),
|
||||
nn.ReLU(inplace=True),
|
||||
nn.ReflectionPad2d(1),
|
||||
nn.Conv2d(in_features, in_features, 3),
|
||||
norm_layer(in_features)
|
||||
]
|
||||
|
||||
self.conv_block = nn.Sequential(*conv_block)
|
||||
|
||||
def forward(self, x):
|
||||
return x + self.conv_block(x)
|
||||
|
||||
|
||||
class Generator(nn.Module):
|
||||
def __init__(self, input_nc, output_nc, n_residual_blocks=9, sigmoid=True):
|
||||
super(Generator, self).__init__()
|
||||
|
||||
# Initial convolution block
|
||||
model0 = [ nn.ReflectionPad2d(3),
|
||||
nn.Conv2d(input_nc, 64, 7),
|
||||
norm_layer(64),
|
||||
nn.ReLU(inplace=True) ]
|
||||
self.model0 = nn.Sequential(*model0)
|
||||
|
||||
# Downsampling
|
||||
model1 = []
|
||||
in_features = 64
|
||||
out_features = in_features*2
|
||||
for _ in range(2):
|
||||
model1 += [ nn.Conv2d(in_features, out_features, 3, stride=2, padding=1),
|
||||
norm_layer(out_features),
|
||||
nn.ReLU(inplace=True) ]
|
||||
in_features = out_features
|
||||
out_features = in_features*2
|
||||
self.model1 = nn.Sequential(*model1)
|
||||
|
||||
model2 = []
|
||||
# Residual blocks
|
||||
for _ in range(n_residual_blocks):
|
||||
model2 += [ResidualBlock(in_features)]
|
||||
self.model2 = nn.Sequential(*model2)
|
||||
|
||||
# Upsampling
|
||||
model3 = []
|
||||
out_features = in_features//2
|
||||
for _ in range(2):
|
||||
model3 += [ nn.ConvTranspose2d(in_features, out_features, 3, stride=2, padding=1, output_padding=1),
|
||||
norm_layer(out_features),
|
||||
nn.ReLU(inplace=True) ]
|
||||
in_features = out_features
|
||||
out_features = in_features//2
|
||||
self.model3 = nn.Sequential(*model3)
|
||||
|
||||
# Output layer
|
||||
model4 = [ nn.ReflectionPad2d(3),
|
||||
nn.Conv2d(64, output_nc, 7)]
|
||||
if sigmoid:
|
||||
model4 += [nn.Sigmoid()]
|
||||
|
||||
self.model4 = nn.Sequential(*model4)
|
||||
|
||||
def forward(self, x, cond=None):
|
||||
out = self.model0(x)
|
||||
out = self.model1(out)
|
||||
out = self.model2(out)
|
||||
out = self.model3(out)
|
||||
out = self.model4(out)
|
||||
|
||||
return out
|
||||
|
||||
|
||||
class LineartDetector:
|
||||
model_dir = os.path.join(models_path, "lineart")
|
||||
model_default = 'sk_model.pth'
|
||||
model_coarse = 'sk_model2.pth'
|
||||
|
||||
def __init__(self, model_name):
|
||||
self.model = None
|
||||
self.model_name = model_name
|
||||
self.device = devices.get_device_for("controlnet")
|
||||
|
||||
def load_model(self, name):
|
||||
remote_model_path = "https://huggingface.co/lllyasviel/Annotators/resolve/main/" + name
|
||||
model_path = os.path.join(self.model_dir, name)
|
||||
if not os.path.exists(model_path):
|
||||
from modules.modelloader import load_file_from_url
|
||||
load_file_from_url(remote_model_path, model_dir=self.model_dir)
|
||||
model = Generator(3, 1, 3)
|
||||
model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
|
||||
model.eval()
|
||||
self.model = model.to(self.device)
|
||||
|
||||
def unload_model(self):
|
||||
if self.model is not None:
|
||||
self.model.cpu()
|
||||
|
||||
def __call__(self, input_image):
|
||||
if self.model is None:
|
||||
self.load_model(self.model_name)
|
||||
self.model.to(self.device)
|
||||
|
||||
assert input_image.ndim == 3
|
||||
image = input_image
|
||||
with torch.no_grad():
|
||||
image = torch.from_numpy(image).float().to(self.device)
|
||||
image = image / 255.0
|
||||
image = rearrange(image, 'h w c -> 1 c h w')
|
||||
line = self.model(image)[0][0]
|
||||
|
||||
line = line.cpu().numpy()
|
||||
line = (line * 255.0).clip(0, 255).astype(np.uint8)
|
||||
|
||||
return line
|
||||
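A usage sketch (not part of this commit) of the LineartDetector class above; the all-zero array is a stand-in for a real photo, and the weights are downloaded on first use:

# Hypothetical sketch: run the lineart annotator on an HxWx3 uint8 image.
import numpy as np

detector = LineartDetector(LineartDetector.model_default)
image = np.zeros((512, 512, 3), dtype=np.uint8)   # stand-in for an input photo
line_map = detector(image)                         # HxW uint8 line drawing
detector.unload_model()                            # move the weights back to the CPU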
@@ -0,0 +1,21 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2022 Caroline Chan
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
@@ -0,0 +1,161 @@
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import functools
|
||||
|
||||
import os
|
||||
import cv2
|
||||
from einops import rearrange
|
||||
from modules import devices
|
||||
from annotator.annotator_path import models_path
|
||||
|
||||
|
||||
class UnetGenerator(nn.Module):
|
||||
"""Create a Unet-based generator"""
|
||||
|
||||
def __init__(self, input_nc, output_nc, num_downs, ngf=64, norm_layer=nn.BatchNorm2d, use_dropout=False):
|
||||
"""Construct a Unet generator
|
||||
Parameters:
|
||||
input_nc (int) -- the number of channels in input images
|
||||
output_nc (int) -- the number of channels in output images
|
||||
num_downs (int) -- the number of downsamplings in UNet. For example, if |num_downs| == 7,
|
||||
an image of size 128x128 will become of size 1x1 at the bottleneck
|
||||
ngf (int) -- the number of filters in the last conv layer
|
||||
norm_layer -- normalization layer
|
||||
We construct the U-Net from the innermost layer to the outermost layer.
|
||||
It is a recursive process.
|
||||
"""
|
||||
super(UnetGenerator, self).__init__()
|
||||
# construct unet structure
|
||||
unet_block = UnetSkipConnectionBlock(ngf * 8, ngf * 8, input_nc=None, submodule=None, norm_layer=norm_layer, innermost=True) # add the innermost layer
|
||||
for _ in range(num_downs - 5): # add intermediate layers with ngf * 8 filters
|
||||
unet_block = UnetSkipConnectionBlock(ngf * 8, ngf * 8, input_nc=None, submodule=unet_block, norm_layer=norm_layer, use_dropout=use_dropout)
|
||||
# gradually reduce the number of filters from ngf * 8 to ngf
|
||||
unet_block = UnetSkipConnectionBlock(ngf * 4, ngf * 8, input_nc=None, submodule=unet_block, norm_layer=norm_layer)
|
||||
unet_block = UnetSkipConnectionBlock(ngf * 2, ngf * 4, input_nc=None, submodule=unet_block, norm_layer=norm_layer)
|
||||
unet_block = UnetSkipConnectionBlock(ngf, ngf * 2, input_nc=None, submodule=unet_block, norm_layer=norm_layer)
|
||||
self.model = UnetSkipConnectionBlock(output_nc, ngf, input_nc=input_nc, submodule=unet_block, outermost=True, norm_layer=norm_layer) # add the outermost layer
|
||||
|
||||
def forward(self, input):
|
||||
"""Standard forward"""
|
||||
return self.model(input)
|
||||
|
||||
|
||||
class UnetSkipConnectionBlock(nn.Module):
|
||||
"""Defines the Unet submodule with skip connection.
|
||||
X -------------------identity----------------------
|
||||
|-- downsampling -- |submodule| -- upsampling --|
|
||||
"""
|
||||
|
||||
def __init__(self, outer_nc, inner_nc, input_nc=None,
|
||||
submodule=None, outermost=False, innermost=False, norm_layer=nn.BatchNorm2d, use_dropout=False):
|
||||
"""Construct a Unet submodule with skip connections.
|
||||
Parameters:
|
||||
outer_nc (int) -- the number of filters in the outer conv layer
|
||||
inner_nc (int) -- the number of filters in the inner conv layer
|
||||
input_nc (int) -- the number of channels in input images/features
|
||||
submodule (UnetSkipConnectionBlock) -- previously defined submodules
|
||||
outermost (bool) -- if this module is the outermost module
|
||||
innermost (bool) -- if this module is the innermost module
|
||||
norm_layer -- normalization layer
|
||||
use_dropout (bool) -- if use dropout layers.
|
||||
"""
|
||||
super(UnetSkipConnectionBlock, self).__init__()
|
||||
self.outermost = outermost
|
||||
if type(norm_layer) == functools.partial:
|
||||
use_bias = norm_layer.func == nn.InstanceNorm2d
|
||||
else:
|
||||
use_bias = norm_layer == nn.InstanceNorm2d
|
||||
if input_nc is None:
|
||||
input_nc = outer_nc
|
||||
downconv = nn.Conv2d(input_nc, inner_nc, kernel_size=4,
|
||||
stride=2, padding=1, bias=use_bias)
|
||||
downrelu = nn.LeakyReLU(0.2, True)
|
||||
downnorm = norm_layer(inner_nc)
|
||||
uprelu = nn.ReLU(True)
|
||||
upnorm = norm_layer(outer_nc)
|
||||
|
||||
if outermost:
|
||||
upconv = nn.ConvTranspose2d(inner_nc * 2, outer_nc,
|
||||
kernel_size=4, stride=2,
|
||||
padding=1)
|
||||
down = [downconv]
|
||||
up = [uprelu, upconv, nn.Tanh()]
|
||||
model = down + [submodule] + up
|
||||
elif innermost:
|
||||
upconv = nn.ConvTranspose2d(inner_nc, outer_nc,
|
||||
kernel_size=4, stride=2,
|
||||
padding=1, bias=use_bias)
|
||||
down = [downrelu, downconv]
|
||||
up = [uprelu, upconv, upnorm]
|
||||
model = down + up
|
||||
else:
|
||||
upconv = nn.ConvTranspose2d(inner_nc * 2, outer_nc,
|
||||
kernel_size=4, stride=2,
|
||||
padding=1, bias=use_bias)
|
||||
down = [downrelu, downconv, downnorm]
|
||||
up = [uprelu, upconv, upnorm]
|
||||
|
||||
if use_dropout:
|
||||
model = down + [submodule] + up + [nn.Dropout(0.5)]
|
||||
else:
|
||||
model = down + [submodule] + up
|
||||
|
||||
self.model = nn.Sequential(*model)
|
||||
|
||||
def forward(self, x):
|
||||
if self.outermost:
|
||||
return self.model(x)
|
||||
else: # add skip connections
|
||||
return torch.cat([x, self.model(x)], 1)
|
||||
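A small sketch (not part of this commit) instantiating the recursively built U-Net defined above; the argument values mirror those used in load_model below:

# Hypothetical sketch: build the generator and run a dummy forward pass.
import functools
import torch
import torch.nn as nn

norm = functools.partial(nn.InstanceNorm2d, affine=False, track_running_stats=False)
net = UnetGenerator(input_nc=3, output_nc=1, num_downs=8, ngf=64, norm_layer=norm)
out = net(torch.randn(1, 3, 256, 256))        # 256 = 2**8, so the bottleneck reaches 1x1
print(out.shape)                              # torch.Size([1, 1, 256, 256])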
|
||||
|
||||
class LineartAnimeDetector:
|
||||
model_dir = os.path.join(models_path, "lineart_anime")
|
||||
|
||||
def __init__(self):
|
||||
self.model = None
|
||||
self.device = devices.get_device_for("controlnet")
|
||||
|
||||
def load_model(self):
|
||||
remote_model_path = "https://huggingface.co/lllyasviel/Annotators/resolve/main/netG.pth"
|
||||
modelpath = os.path.join(self.model_dir, "netG.pth")
|
||||
if not os.path.exists(modelpath):
|
||||
from modules.modelloader import load_file_from_url
|
||||
load_file_from_url(remote_model_path, model_dir=self.model_dir)
|
||||
norm_layer = functools.partial(nn.InstanceNorm2d, affine=False, track_running_stats=False)
|
||||
net = UnetGenerator(3, 1, 8, 64, norm_layer=norm_layer, use_dropout=False)
|
||||
ckpt = torch.load(modelpath)
|
||||
for key in list(ckpt.keys()):
|
||||
if 'module.' in key:
|
||||
ckpt[key.replace('module.', '')] = ckpt[key]
|
||||
del ckpt[key]
|
||||
net.load_state_dict(ckpt)
|
||||
net.eval()
|
||||
self.model = net.to(self.device)
|
||||
|
||||
def unload_model(self):
|
||||
if self.model is not None:
|
||||
self.model.cpu()
|
||||
|
||||
def __call__(self, input_image):
|
||||
if self.model is None:
|
||||
self.load_model()
|
||||
self.model.to(self.device)
|
||||
|
||||
H, W, C = input_image.shape
|
||||
Hn = 256 * int(np.ceil(float(H) / 256.0))
|
||||
Wn = 256 * int(np.ceil(float(W) / 256.0))
|
||||
img = cv2.resize(input_image, (Wn, Hn), interpolation=cv2.INTER_CUBIC)
|
||||
with torch.no_grad():
|
||||
image_feed = torch.from_numpy(img).float().to(self.device)
|
||||
image_feed = image_feed / 127.5 - 1.0
|
||||
image_feed = rearrange(image_feed, 'h w c -> 1 c h w')
|
||||
|
||||
line = self.model(image_feed)[0, 0] * 127.5 + 127.5
|
||||
line = line.cpu().numpy()
|
||||
|
||||
line = cv2.resize(line, (W, H), interpolation=cv2.INTER_CUBIC)
|
||||
line = line.clip(0, 255).astype(np.uint8)
|
||||
return line
|
||||
|
||||
21
extensions-builtin/forge_legacy_preprocessors/annotator/manga_line/LICENSE
Executable file
@@ -0,0 +1,21 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2021 Miaomiao Li
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
248
extensions-builtin/forge_legacy_preprocessors/annotator/manga_line/__init__.py
Executable file
@@ -0,0 +1,248 @@
|
||||
import os
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from PIL import Image
|
||||
import fnmatch
|
||||
import cv2
|
||||
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
from einops import rearrange
|
||||
from modules import devices
|
||||
from annotator.annotator_path import models_path
|
||||
|
||||
|
||||
class _bn_relu_conv(nn.Module):
|
||||
def __init__(self, in_filters, nb_filters, fw, fh, subsample=1):
|
||||
super(_bn_relu_conv, self).__init__()
|
||||
self.model = nn.Sequential(
|
||||
nn.BatchNorm2d(in_filters, eps=1e-3),
|
||||
nn.LeakyReLU(0.2),
|
||||
nn.Conv2d(in_filters, nb_filters, (fw, fh), stride=subsample, padding=(fw//2, fh//2), padding_mode='zeros')
|
||||
)
|
||||
|
||||
def forward(self, x):
|
||||
return self.model(x)
|
||||
|
||||
# the following debug code is unreachable after the return above (kept for reference)
|
||||
print("****", np.max(x.cpu().numpy()), np.min(x.cpu().numpy()), np.mean(x.cpu().numpy()), np.std(x.cpu().numpy()), x.shape)
|
||||
for i,layer in enumerate(self.model):
|
||||
if i != 2:
|
||||
x = layer(x)
|
||||
else:
|
||||
x = layer(x)
|
||||
#x = nn.functional.pad(x, (1, 1, 1, 1), mode='constant', value=0)
|
||||
print("____", np.max(x.cpu().numpy()), np.min(x.cpu().numpy()), np.mean(x.cpu().numpy()), np.std(x.cpu().numpy()), x.shape)
|
||||
print(x[0])
|
||||
return x
|
||||
|
||||
class _u_bn_relu_conv(nn.Module):
|
||||
def __init__(self, in_filters, nb_filters, fw, fh, subsample=1):
|
||||
super(_u_bn_relu_conv, self).__init__()
|
||||
self.model = nn.Sequential(
|
||||
nn.BatchNorm2d(in_filters, eps=1e-3),
|
||||
nn.LeakyReLU(0.2),
|
||||
nn.Conv2d(in_filters, nb_filters, (fw, fh), stride=subsample, padding=(fw//2, fh//2)),
|
||||
nn.Upsample(scale_factor=2, mode='nearest')
|
||||
)
|
||||
|
||||
def forward(self, x):
|
||||
return self.model(x)
|
||||
|
||||
|
||||
|
||||
class _shortcut(nn.Module):
|
||||
def __init__(self, in_filters, nb_filters, subsample=1):
|
||||
super(_shortcut, self).__init__()
|
||||
self.process = False
|
||||
self.model = None
|
||||
if in_filters != nb_filters or subsample != 1:
|
||||
self.process = True
|
||||
self.model = nn.Sequential(
|
||||
nn.Conv2d(in_filters, nb_filters, (1, 1), stride=subsample)
|
||||
)
|
||||
|
||||
def forward(self, x, y):
|
||||
#print(x.size(), y.size(), self.process)
|
||||
if self.process:
|
||||
y0 = self.model(x)
|
||||
#print("merge+", torch.max(y0+y), torch.min(y0+y),torch.mean(y0+y), torch.std(y0+y), y0.shape)
|
||||
return y0 + y
|
||||
else:
|
||||
#print("merge", torch.max(x+y), torch.min(x+y),torch.mean(x+y), torch.std(x+y), y.shape)
|
||||
return x + y
|
||||
|
||||
class _u_shortcut(nn.Module):
|
||||
def __init__(self, in_filters, nb_filters, subsample):
|
||||
super(_u_shortcut, self).__init__()
|
||||
self.process = False
|
||||
self.model = None
|
||||
if in_filters != nb_filters:
|
||||
self.process = True
|
||||
self.model = nn.Sequential(
|
||||
nn.Conv2d(in_filters, nb_filters, (1, 1), stride=subsample, padding_mode='zeros'),
|
||||
nn.Upsample(scale_factor=2, mode='nearest')
|
||||
)
|
||||
|
||||
def forward(self, x, y):
|
||||
if self.process:
|
||||
return self.model(x) + y
|
||||
else:
|
||||
return x + y
|
||||
|
||||
|
||||
class basic_block(nn.Module):
|
||||
def __init__(self, in_filters, nb_filters, init_subsample=1):
|
||||
super(basic_block, self).__init__()
|
||||
self.conv1 = _bn_relu_conv(in_filters, nb_filters, 3, 3, subsample=init_subsample)
|
||||
self.residual = _bn_relu_conv(nb_filters, nb_filters, 3, 3)
|
||||
self.shortcut = _shortcut(in_filters, nb_filters, subsample=init_subsample)
|
||||
|
||||
def forward(self, x):
|
||||
x1 = self.conv1(x)
|
||||
x2 = self.residual(x1)
|
||||
return self.shortcut(x, x2)
|
||||
|
||||
class _u_basic_block(nn.Module):
|
||||
def __init__(self, in_filters, nb_filters, init_subsample=1):
|
||||
super(_u_basic_block, self).__init__()
|
||||
self.conv1 = _u_bn_relu_conv(in_filters, nb_filters, 3, 3, subsample=init_subsample)
|
||||
self.residual = _bn_relu_conv(nb_filters, nb_filters, 3, 3)
|
||||
self.shortcut = _u_shortcut(in_filters, nb_filters, subsample=init_subsample)
|
||||
|
||||
def forward(self, x):
|
||||
y = self.residual(self.conv1(x))
|
||||
return self.shortcut(x, y)
|
||||
|
||||
|
||||
class _residual_block(nn.Module):
|
||||
def __init__(self, in_filters, nb_filters, repetitions, is_first_layer=False):
|
||||
super(_residual_block, self).__init__()
|
||||
layers = []
|
||||
for i in range(repetitions):
|
||||
init_subsample = 1
|
||||
if i == repetitions - 1 and not is_first_layer:
|
||||
init_subsample = 2
|
||||
if i == 0:
|
||||
l = basic_block(in_filters=in_filters, nb_filters=nb_filters, init_subsample=init_subsample)
|
||||
else:
|
||||
l = basic_block(in_filters=nb_filters, nb_filters=nb_filters, init_subsample=init_subsample)
|
||||
layers.append(l)
|
||||
|
||||
self.model = nn.Sequential(*layers)
|
||||
|
||||
def forward(self, x):
|
||||
return self.model(x)
|
||||
|
||||
|
||||
class _upsampling_residual_block(nn.Module):
|
||||
def __init__(self, in_filters, nb_filters, repetitions):
|
||||
super(_upsampling_residual_block, self).__init__()
|
||||
layers = []
|
||||
for i in range(repetitions):
|
||||
l = None
|
||||
if i == 0:
|
||||
l = _u_basic_block(in_filters=in_filters, nb_filters=nb_filters)#(input)
|
||||
else:
|
||||
l = basic_block(in_filters=nb_filters, nb_filters=nb_filters)#(input)
|
||||
layers.append(l)
|
||||
|
||||
self.model = nn.Sequential(*layers)
|
||||
|
||||
def forward(self, x):
|
||||
return self.model(x)
|
||||
|
||||
|
||||
class res_skip(nn.Module):
|
||||
|
||||
def __init__(self):
|
||||
super(res_skip, self).__init__()
|
||||
self.block0 = _residual_block(in_filters=1, nb_filters=24, repetitions=2, is_first_layer=True)#(input)
|
||||
self.block1 = _residual_block(in_filters=24, nb_filters=48, repetitions=3)#(block0)
|
||||
self.block2 = _residual_block(in_filters=48, nb_filters=96, repetitions=5)#(block1)
|
||||
self.block3 = _residual_block(in_filters=96, nb_filters=192, repetitions=7)#(block2)
|
||||
self.block4 = _residual_block(in_filters=192, nb_filters=384, repetitions=12)#(block3)
|
||||
|
||||
self.block5 = _upsampling_residual_block(in_filters=384, nb_filters=192, repetitions=7)#(block4)
|
||||
self.res1 = _shortcut(in_filters=192, nb_filters=192)#(block3, block5, subsample=(1,1))
|
||||
|
||||
self.block6 = _upsampling_residual_block(in_filters=192, nb_filters=96, repetitions=5)#(res1)
|
||||
self.res2 = _shortcut(in_filters=96, nb_filters=96)#(block2, block6, subsample=(1,1))
|
||||
|
||||
self.block7 = _upsampling_residual_block(in_filters=96, nb_filters=48, repetitions=3)#(res2)
|
||||
self.res3 = _shortcut(in_filters=48, nb_filters=48)#(block1, block7, subsample=(1,1))
|
||||
|
||||
self.block8 = _upsampling_residual_block(in_filters=48, nb_filters=24, repetitions=2)#(res3)
|
||||
self.res4 = _shortcut(in_filters=24, nb_filters=24)#(block0,block8, subsample=(1,1))
|
||||
|
||||
self.block9 = _residual_block(in_filters=24, nb_filters=16, repetitions=2, is_first_layer=True)#(res4)
|
||||
self.conv15 = _bn_relu_conv(in_filters=16, nb_filters=1, fh=1, fw=1, subsample=1)#(block7)
|
||||
|
||||
def forward(self, x):
|
||||
x0 = self.block0(x)
|
||||
x1 = self.block1(x0)
|
||||
x2 = self.block2(x1)
|
||||
x3 = self.block3(x2)
|
||||
x4 = self.block4(x3)
|
||||
|
||||
x5 = self.block5(x4)
|
||||
res1 = self.res1(x3, x5)
|
||||
|
||||
x6 = self.block6(res1)
|
||||
res2 = self.res2(x2, x6)
|
||||
|
||||
x7 = self.block7(res2)
|
||||
res3 = self.res3(x1, x7)
|
||||
|
||||
x8 = self.block8(res3)
|
||||
res4 = self.res4(x0, x8)
|
||||
|
||||
x9 = self.block9(res4)
|
||||
y = self.conv15(x9)
|
||||
|
||||
return y
|
||||
|
||||
|
||||
class MangaLineExtration:
|
||||
model_dir = os.path.join(models_path, "manga_line")
|
||||
|
||||
def __init__(self):
|
||||
self.model = None
|
||||
self.device = devices.get_device_for("controlnet")
|
||||
|
||||
def load_model(self):
|
||||
remote_model_path = "https://huggingface.co/lllyasviel/Annotators/resolve/main/erika.pth"
|
||||
modelpath = os.path.join(self.model_dir, "erika.pth")
|
||||
if not os.path.exists(modelpath):
|
||||
from modules.modelloader import load_file_from_url
|
||||
load_file_from_url(remote_model_path, model_dir=self.model_dir)
|
||||
#norm_layer = functools.partial(nn.InstanceNorm2d, affine=False, track_running_stats=False)
|
||||
net = res_skip()
|
||||
ckpt = torch.load(modelpath)
|
||||
for key in list(ckpt.keys()):
|
||||
if 'module.' in key:
|
||||
ckpt[key.replace('module.', '')] = ckpt[key]
|
||||
del ckpt[key]
|
||||
net.load_state_dict(ckpt)
|
||||
net.eval()
|
||||
self.model = net.to(self.device)
|
||||
|
||||
def unload_model(self):
|
||||
if self.model is not None:
|
||||
self.model.cpu()
|
||||
|
||||
def __call__(self, input_image):
|
||||
if self.model is None:
|
||||
self.load_model()
|
||||
self.model.to(self.device)
|
||||
img = cv2.cvtColor(input_image, cv2.COLOR_RGB2GRAY)
|
||||
img = np.ascontiguousarray(img.copy()).copy()
|
||||
with torch.no_grad():
|
||||
image_feed = torch.from_numpy(img).float().to(self.device)
|
||||
image_feed = rearrange(image_feed, 'h w -> 1 1 h w')
|
||||
line = self.model(image_feed)
|
||||
line = 255 - line.cpu().numpy()[0, 0]
|
||||
return line.clip(0, 255).astype(np.uint8)
|
||||
|
||||
|
||||
@@ -0,0 +1,5 @@
|
||||
from .mediapipe_face_common import generate_annotation
|
||||
|
||||
|
||||
def apply_mediapipe_face(image, max_faces: int = 1, min_confidence: float = 0.5):
|
||||
return generate_annotation(image, max_faces, min_confidence)
|
||||
@@ -0,0 +1,155 @@
from typing import Mapping

import mediapipe as mp
import numpy


mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles
mp_face_detection = mp.solutions.face_detection  # Only for counting faces.
mp_face_mesh = mp.solutions.face_mesh
mp_face_connections = mp.solutions.face_mesh_connections.FACEMESH_TESSELATION
mp_hand_connections = mp.solutions.hands_connections.HAND_CONNECTIONS
mp_body_connections = mp.solutions.pose_connections.POSE_CONNECTIONS

DrawingSpec = mp.solutions.drawing_styles.DrawingSpec
PoseLandmark = mp.solutions.drawing_styles.PoseLandmark

min_face_size_pixels: int = 64
f_thick = 2
f_rad = 1
right_iris_draw = DrawingSpec(color=(10, 200, 250), thickness=f_thick, circle_radius=f_rad)
right_eye_draw = DrawingSpec(color=(10, 200, 180), thickness=f_thick, circle_radius=f_rad)
right_eyebrow_draw = DrawingSpec(color=(10, 220, 180), thickness=f_thick, circle_radius=f_rad)
left_iris_draw = DrawingSpec(color=(250, 200, 10), thickness=f_thick, circle_radius=f_rad)
left_eye_draw = DrawingSpec(color=(180, 200, 10), thickness=f_thick, circle_radius=f_rad)
left_eyebrow_draw = DrawingSpec(color=(180, 220, 10), thickness=f_thick, circle_radius=f_rad)
mouth_draw = DrawingSpec(color=(10, 180, 10), thickness=f_thick, circle_radius=f_rad)
head_draw = DrawingSpec(color=(10, 200, 10), thickness=f_thick, circle_radius=f_rad)

# mp_face_mesh.FACEMESH_CONTOURS has all the items we care about.
face_connection_spec = {}
for edge in mp_face_mesh.FACEMESH_FACE_OVAL:
    face_connection_spec[edge] = head_draw
for edge in mp_face_mesh.FACEMESH_LEFT_EYE:
    face_connection_spec[edge] = left_eye_draw
for edge in mp_face_mesh.FACEMESH_LEFT_EYEBROW:
    face_connection_spec[edge] = left_eyebrow_draw
# for edge in mp_face_mesh.FACEMESH_LEFT_IRIS:
#    face_connection_spec[edge] = left_iris_draw
for edge in mp_face_mesh.FACEMESH_RIGHT_EYE:
    face_connection_spec[edge] = right_eye_draw
for edge in mp_face_mesh.FACEMESH_RIGHT_EYEBROW:
    face_connection_spec[edge] = right_eyebrow_draw
# for edge in mp_face_mesh.FACEMESH_RIGHT_IRIS:
#    face_connection_spec[edge] = right_iris_draw
for edge in mp_face_mesh.FACEMESH_LIPS:
    face_connection_spec[edge] = mouth_draw
iris_landmark_spec = {468: right_iris_draw, 473: left_iris_draw}


def draw_pupils(image, landmark_list, drawing_spec, halfwidth: int = 2):
    """We have a custom function to draw the pupils because the mp.draw_landmarks method requires a parameter for all
    landmarks. Until our PR is merged into mediapipe, we need this separate method."""
    if len(image.shape) != 3:
        raise ValueError("Input image must be H,W,C.")
    image_rows, image_cols, image_channels = image.shape
    if image_channels != 3:  # BGR channels
        raise ValueError('Input image must contain three channel bgr data.')
    for idx, landmark in enumerate(landmark_list.landmark):
        if (
                (landmark.HasField('visibility') and landmark.visibility < 0.9) or
                (landmark.HasField('presence') and landmark.presence < 0.5)
        ):
            continue
        if landmark.x >= 1.0 or landmark.x < 0 or landmark.y >= 1.0 or landmark.y < 0:
            continue
        image_x = int(image_cols*landmark.x)
        image_y = int(image_rows*landmark.y)
        draw_color = None
        if isinstance(drawing_spec, Mapping):
            if drawing_spec.get(idx) is None:
                continue
            else:
                draw_color = drawing_spec[idx].color
        elif isinstance(drawing_spec, DrawingSpec):
            draw_color = drawing_spec.color
        image[image_y-halfwidth:image_y+halfwidth, image_x-halfwidth:image_x+halfwidth, :] = draw_color


def reverse_channels(image):
    """Given a numpy array in RGB form, convert to BGR. Will also convert from BGR to RGB."""
    # im[:,:,::-1] is a neat hack to convert BGR to RGB by reversing the indexing order.
    # im[:,:,::[2,1,0]] would also work but makes a copy of the data.
    return image[:, :, ::-1]


def generate_annotation(
        img_rgb,
        max_faces: int,
        min_confidence: float
):
    """
    Find up to 'max_faces' inside the provided input image.
    If min_face_size_pixels is provided and nonzero it will be used to filter faces that occupy less than this many
    pixels in the image.
    """
    with mp_face_mesh.FaceMesh(
            static_image_mode=True,
            max_num_faces=max_faces,
            refine_landmarks=True,
            min_detection_confidence=min_confidence,
    ) as facemesh:
        img_height, img_width, img_channels = img_rgb.shape
        assert(img_channels == 3)

        results = facemesh.process(img_rgb).multi_face_landmarks

        if results is None:
            print("No faces detected in controlnet image for Mediapipe face annotator.")
            return numpy.zeros_like(img_rgb)

        # Filter faces that are too small
        filtered_landmarks = []
        for lm in results:
            landmarks = lm.landmark
            face_rect = [
                landmarks[0].x,
                landmarks[0].y,
                landmarks[0].x,
                landmarks[0].y,
            ]  # Left, up, right, down.
            for i in range(len(landmarks)):
                face_rect[0] = min(face_rect[0], landmarks[i].x)
                face_rect[1] = min(face_rect[1], landmarks[i].y)
                face_rect[2] = max(face_rect[2], landmarks[i].x)
                face_rect[3] = max(face_rect[3], landmarks[i].y)
            if min_face_size_pixels > 0:
                face_width = abs(face_rect[2] - face_rect[0])
                face_height = abs(face_rect[3] - face_rect[1])
                face_width_pixels = face_width * img_width
                face_height_pixels = face_height * img_height
                face_size = min(face_width_pixels, face_height_pixels)
                if face_size >= min_face_size_pixels:
                    filtered_landmarks.append(lm)
            else:
                filtered_landmarks.append(lm)

        # Annotations are drawn in BGR for some reason, but we don't need to flip a zero-filled image at the start.
        empty = numpy.zeros_like(img_rgb)

        # Draw detected faces:
        for face_landmarks in filtered_landmarks:
            mp_drawing.draw_landmarks(
                empty,
                face_landmarks,
                connections=face_connection_spec.keys(),
                landmark_drawing_spec=None,
                connection_drawing_spec=face_connection_spec
            )
            draw_pupils(empty, face_landmarks, iris_landmark_spec, 2)

        # Flip BGR back to RGB.
        empty = reverse_channels(empty).copy()

        return empty
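A brief, hypothetical usage sketch for the face annotator above; it assumes the import path shown, an installed mediapipe package, and an RGB uint8 input image. The file names are placeholders.

# Hypothetical usage sketch: annotate up to two faces in an RGB uint8 image.
import cv2

from annotator.mediapipe_face.mediapipe_face_common import generate_annotation  # import path is an assumption

bgr = cv2.imread("portrait.png")
rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
annotation = generate_annotation(rgb, max_faces=2, min_confidence=0.5)  # RGB uint8, same shape as the input
# Faces whose landmark bounding box is smaller than min_face_size_pixels (64 px by default) are skipped.
cv2.imwrite("face_annotation.png", cv2.cvtColor(annotation, cv2.COLOR_RGB2BGR))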
21
extensions-builtin/forge_legacy_preprocessors/annotator/midas/LICENSE
Executable file
21
extensions-builtin/forge_legacy_preprocessors/annotator/midas/LICENSE
Executable file
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2019 Intel ISL (Intel Intelligent Systems Lab)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
49
extensions-builtin/forge_legacy_preprocessors/annotator/midas/__init__.py
Executable file
49
extensions-builtin/forge_legacy_preprocessors/annotator/midas/__init__.py
Executable file
@@ -0,0 +1,49 @@
import cv2
import numpy as np
import torch

from einops import rearrange
from .api import MiDaSInference
from modules import devices

model = None


def unload_midas_model():
    global model
    if model is not None:
        model = model.cpu()


def apply_midas(input_image, a=np.pi * 2.0, bg_th=0.1):
    global model
    if model is None:
        model = MiDaSInference(model_type="dpt_hybrid")
    if devices.get_device_for("controlnet").type != 'mps':
        model = model.to(devices.get_device_for("controlnet"))

    assert input_image.ndim == 3
    image_depth = input_image
    with torch.no_grad():
        image_depth = torch.from_numpy(image_depth).float()
        if devices.get_device_for("controlnet").type != 'mps':
            image_depth = image_depth.to(devices.get_device_for("controlnet"))
        image_depth = image_depth / 127.5 - 1.0
        image_depth = rearrange(image_depth, 'h w c -> 1 c h w')
        depth = model(image_depth)[0]

        depth_pt = depth.clone()
        depth_pt -= torch.min(depth_pt)
        depth_pt /= torch.max(depth_pt)
        depth_pt = depth_pt.cpu().numpy()
        depth_image = (depth_pt * 255.0).clip(0, 255).astype(np.uint8)

        depth_np = depth.cpu().numpy()
        x = cv2.Sobel(depth_np, cv2.CV_32F, 1, 0, ksize=3)
        y = cv2.Sobel(depth_np, cv2.CV_32F, 0, 1, ksize=3)
        z = np.ones_like(x) * a
        x[depth_pt < bg_th] = 0
        y[depth_pt < bg_th] = 0
        normal = np.stack([x, y, z], axis=2)
        normal /= np.sum(normal ** 2.0, axis=2, keepdims=True) ** 0.5
        normal_image = (normal * 127.5 + 127.5).clip(0, 255).astype(np.uint8)[:, :, ::-1]

        return depth_image, normal_image
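A short usage sketch for apply_midas, assuming the webui environment is available and the DPT-hybrid checkpoint can be found or downloaded; the import path and file names below are assumptions.

# Hypothetical usage sketch: compute MiDaS depth and normal maps for one image.
import cv2

from annotator.midas import apply_midas, unload_midas_model  # import path is an assumption

bgr = cv2.imread("room.png")
rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)       # HWC uint8, as apply_midas expects
depth_image, normal_image = apply_midas(rgb)     # both returned as uint8 images
cv2.imwrite("depth.png", depth_image)
cv2.imwrite("normal.png", normal_image)          # normal map is already channel-flipped for OpenCV
unload_midas_model()                             # move the cached model back to CPU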
181
extensions-builtin/forge_legacy_preprocessors/annotator/midas/api.py
Executable file
181
extensions-builtin/forge_legacy_preprocessors/annotator/midas/api.py
Executable file
@@ -0,0 +1,181 @@
|
||||
# based on https://github.com/isl-org/MiDaS
|
||||
|
||||
import cv2
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import os
|
||||
from annotator.annotator_path import models_path
|
||||
|
||||
from torchvision.transforms import Compose
|
||||
|
||||
from .midas.dpt_depth import DPTDepthModel
|
||||
from .midas.midas_net import MidasNet
|
||||
from .midas.midas_net_custom import MidasNet_small
|
||||
from .midas.transforms import Resize, NormalizeImage, PrepareForNet
|
||||
|
||||
base_model_path = os.path.join(models_path, "midas")
|
||||
old_modeldir = os.path.dirname(os.path.realpath(__file__))
|
||||
remote_model_path = "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/dpt_hybrid-midas-501f0c75.pt"
|
||||
|
||||
ISL_PATHS = {
|
||||
"dpt_large": os.path.join(base_model_path, "dpt_large-midas-2f21e586.pt"),
|
||||
"dpt_hybrid": os.path.join(base_model_path, "dpt_hybrid-midas-501f0c75.pt"),
|
||||
"midas_v21": "",
|
||||
"midas_v21_small": "",
|
||||
}
|
||||
|
||||
OLD_ISL_PATHS = {
|
||||
"dpt_large": os.path.join(old_modeldir, "dpt_large-midas-2f21e586.pt"),
|
||||
"dpt_hybrid": os.path.join(old_modeldir, "dpt_hybrid-midas-501f0c75.pt"),
|
||||
"midas_v21": "",
|
||||
"midas_v21_small": "",
|
||||
}
|
||||
|
||||
|
||||
def disabled_train(self, mode=True):
|
||||
"""Overwrite model.train with this function to make sure train/eval mode
|
||||
does not change anymore."""
|
||||
return self
|
||||
|
||||
|
||||
def load_midas_transform(model_type):
|
||||
# https://github.com/isl-org/MiDaS/blob/master/run.py
|
||||
# load transform only
|
||||
if model_type == "dpt_large": # DPT-Large
|
||||
net_w, net_h = 384, 384
|
||||
resize_mode = "minimal"
|
||||
normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
|
||||
|
||||
elif model_type == "dpt_hybrid": # DPT-Hybrid
|
||||
net_w, net_h = 384, 384
|
||||
resize_mode = "minimal"
|
||||
normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
|
||||
|
||||
elif model_type == "midas_v21":
|
||||
net_w, net_h = 384, 384
|
||||
resize_mode = "upper_bound"
|
||||
normalization = NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
|
||||
|
||||
elif model_type == "midas_v21_small":
|
||||
net_w, net_h = 256, 256
|
||||
resize_mode = "upper_bound"
|
||||
normalization = NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
|
||||
|
||||
else:
|
||||
assert False, f"model_type '{model_type}' not implemented, use: --model_type large"
|
||||
|
||||
transform = Compose(
|
||||
[
|
||||
Resize(
|
||||
net_w,
|
||||
net_h,
|
||||
resize_target=None,
|
||||
keep_aspect_ratio=True,
|
||||
ensure_multiple_of=32,
|
||||
resize_method=resize_mode,
|
||||
image_interpolation_method=cv2.INTER_CUBIC,
|
||||
),
|
||||
normalization,
|
||||
PrepareForNet(),
|
||||
]
|
||||
)
|
||||
|
||||
return transform
|
||||
|
||||
|
||||
def load_model(model_type):
|
||||
# https://github.com/isl-org/MiDaS/blob/master/run.py
|
||||
# load network
|
||||
model_path = ISL_PATHS[model_type]
|
||||
old_model_path = OLD_ISL_PATHS[model_type]
|
||||
if model_type == "dpt_large": # DPT-Large
|
||||
model = DPTDepthModel(
|
||||
path=model_path,
|
||||
backbone="vitl16_384",
|
||||
non_negative=True,
|
||||
)
|
||||
net_w, net_h = 384, 384
|
||||
resize_mode = "minimal"
|
||||
normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
|
||||
|
||||
elif model_type == "dpt_hybrid": # DPT-Hybrid
|
||||
if os.path.exists(old_model_path):
|
||||
model_path = old_model_path
|
||||
elif not os.path.exists(model_path):
|
||||
from modules.modelloader import load_file_from_url
|
||||
load_file_from_url(remote_model_path, model_dir=base_model_path)
|
||||
|
||||
model = DPTDepthModel(
|
||||
path=model_path,
|
||||
backbone="vitb_rn50_384",
|
||||
non_negative=True,
|
||||
)
|
||||
net_w, net_h = 384, 384
|
||||
resize_mode = "minimal"
|
||||
normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
|
||||
|
||||
elif model_type == "midas_v21":
|
||||
model = MidasNet(model_path, non_negative=True)
|
||||
net_w, net_h = 384, 384
|
||||
resize_mode = "upper_bound"
|
||||
normalization = NormalizeImage(
|
||||
mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
|
||||
)
|
||||
|
||||
elif model_type == "midas_v21_small":
|
||||
model = MidasNet_small(model_path, features=64, backbone="efficientnet_lite3", exportable=True,
|
||||
non_negative=True, blocks={'expand': True})
|
||||
net_w, net_h = 256, 256
|
||||
resize_mode = "upper_bound"
|
||||
normalization = NormalizeImage(
|
||||
mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
|
||||
)
|
||||
|
||||
else:
|
||||
print(f"model_type '{model_type}' not implemented, use: --model_type large")
|
||||
assert False
|
||||
|
||||
transform = Compose(
|
||||
[
|
||||
Resize(
|
||||
net_w,
|
||||
net_h,
|
||||
resize_target=None,
|
||||
keep_aspect_ratio=True,
|
||||
ensure_multiple_of=32,
|
||||
resize_method=resize_mode,
|
||||
image_interpolation_method=cv2.INTER_CUBIC,
|
||||
),
|
||||
normalization,
|
||||
PrepareForNet(),
|
||||
]
|
||||
)
|
||||
|
||||
return model.eval(), transform
|
||||
|
||||
|
||||
class MiDaSInference(nn.Module):
|
||||
MODEL_TYPES_TORCH_HUB = [
|
||||
"DPT_Large",
|
||||
"DPT_Hybrid",
|
||||
"MiDaS_small"
|
||||
]
|
||||
MODEL_TYPES_ISL = [
|
||||
"dpt_large",
|
||||
"dpt_hybrid",
|
||||
"midas_v21",
|
||||
"midas_v21_small",
|
||||
]
|
||||
|
||||
def __init__(self, model_type):
|
||||
super().__init__()
|
||||
assert (model_type in self.MODEL_TYPES_ISL)
|
||||
model, _ = load_model(model_type)
|
||||
self.model = model
|
||||
self.model.train = disabled_train
|
||||
|
||||
def forward(self, x):
|
||||
with torch.no_grad():
|
||||
prediction = self.model(x)
|
||||
return prediction
|
||||
|
||||
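A hedged sketch of driving the MiDaSInference wrapper above directly; it assumes the webui environment, an available or downloadable DPT-hybrid checkpoint, and an already-normalized input batch (real inputs would go through load_midas_transform first). The tensor sizes are illustrative.

# Hypothetical usage sketch: run MiDaSInference on a random, already-normalized batch.
import torch

from annotator.midas.api import MiDaSInference  # import path is an assumption

model = MiDaSInference(model_type="dpt_hybrid").eval()
x = torch.randn(1, 3, 384, 384)      # NCHW; 384x384 matches the DPT-hybrid transform
with torch.no_grad():
    depth = model(x)                 # shape (1, 384, 384): DPTDepthModel squeezes the channel dim
print(depth.shape, depth.min().item(), depth.max().item())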
@@ -0,0 +1,16 @@
import torch


class BaseModel(torch.nn.Module):
    def load(self, path):
        """Load model from file.

        Args:
            path (str): file path
        """
        parameters = torch.load(path, map_location=torch.device('cpu'))

        if "optimizer" in parameters:
            parameters = parameters["model"]

        self.load_state_dict(parameters)
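To illustrate the two checkpoint layouts that BaseModel.load accepts (a bare state_dict versus a training checkpoint that also stores an optimizer), a small hedged sketch; the class and file names are illustrative only.

# Hypothetical sketch of the checkpoint layouts handled by BaseModel.load.
import torch
import torch.nn as nn

from annotator.midas.midas.base_model import BaseModel  # import path is an assumption


class TinyModel(BaseModel):
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(4, 2)


m = TinyModel()

# Layout 1: a plain state_dict.
torch.save(m.state_dict(), "plain.pt")

# Layout 2: a training checkpoint with "model" and "optimizer" entries.
opt = torch.optim.SGD(m.parameters(), lr=0.1)
torch.save({"model": m.state_dict(), "optimizer": opt.state_dict()}, "train_ckpt.pt")

m.load("plain.pt")       # loads the dict directly
m.load("train_ckpt.pt")  # detects "optimizer" and unwraps parameters["model"]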
342
extensions-builtin/forge_legacy_preprocessors/annotator/midas/midas/blocks.py
Executable file
342
extensions-builtin/forge_legacy_preprocessors/annotator/midas/midas/blocks.py
Executable file
@@ -0,0 +1,342 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from .vit import (
|
||||
_make_pretrained_vitb_rn50_384,
|
||||
_make_pretrained_vitl16_384,
|
||||
_make_pretrained_vitb16_384,
|
||||
forward_vit,
|
||||
)
|
||||
|
||||
def _make_encoder(backbone, features, use_pretrained, groups=1, expand=False, exportable=True, hooks=None, use_vit_only=False, use_readout="ignore",):
|
||||
if backbone == "vitl16_384":
|
||||
pretrained = _make_pretrained_vitl16_384(
|
||||
use_pretrained, hooks=hooks, use_readout=use_readout
|
||||
)
|
||||
scratch = _make_scratch(
|
||||
[256, 512, 1024, 1024], features, groups=groups, expand=expand
|
||||
) # ViT-L/16 - 85.0% Top1 (backbone)
|
||||
elif backbone == "vitb_rn50_384":
|
||||
pretrained = _make_pretrained_vitb_rn50_384(
|
||||
use_pretrained,
|
||||
hooks=hooks,
|
||||
use_vit_only=use_vit_only,
|
||||
use_readout=use_readout,
|
||||
)
|
||||
scratch = _make_scratch(
|
||||
[256, 512, 768, 768], features, groups=groups, expand=expand
|
||||
) # ViT-H/16 - 85.0% Top1 (backbone)
|
||||
elif backbone == "vitb16_384":
|
||||
pretrained = _make_pretrained_vitb16_384(
|
||||
use_pretrained, hooks=hooks, use_readout=use_readout
|
||||
)
|
||||
scratch = _make_scratch(
|
||||
[96, 192, 384, 768], features, groups=groups, expand=expand
|
||||
) # ViT-B/16 - 84.6% Top1 (backbone)
|
||||
elif backbone == "resnext101_wsl":
|
||||
pretrained = _make_pretrained_resnext101_wsl(use_pretrained)
|
||||
scratch = _make_scratch([256, 512, 1024, 2048], features, groups=groups, expand=expand) # efficientnet_lite3
|
||||
elif backbone == "efficientnet_lite3":
|
||||
pretrained = _make_pretrained_efficientnet_lite3(use_pretrained, exportable=exportable)
|
||||
scratch = _make_scratch([32, 48, 136, 384], features, groups=groups, expand=expand) # efficientnet_lite3
|
||||
else:
|
||||
print(f"Backbone '{backbone}' not implemented")
|
||||
assert False
|
||||
|
||||
return pretrained, scratch
|
||||
|
||||
|
||||
def _make_scratch(in_shape, out_shape, groups=1, expand=False):
|
||||
scratch = nn.Module()
|
||||
|
||||
out_shape1 = out_shape
|
||||
out_shape2 = out_shape
|
||||
out_shape3 = out_shape
|
||||
out_shape4 = out_shape
|
||||
if expand==True:
|
||||
out_shape1 = out_shape
|
||||
out_shape2 = out_shape*2
|
||||
out_shape3 = out_shape*4
|
||||
out_shape4 = out_shape*8
|
||||
|
||||
scratch.layer1_rn = nn.Conv2d(
|
||||
in_shape[0], out_shape1, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
|
||||
)
|
||||
scratch.layer2_rn = nn.Conv2d(
|
||||
in_shape[1], out_shape2, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
|
||||
)
|
||||
scratch.layer3_rn = nn.Conv2d(
|
||||
in_shape[2], out_shape3, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
|
||||
)
|
||||
scratch.layer4_rn = nn.Conv2d(
|
||||
in_shape[3], out_shape4, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
|
||||
)
|
||||
|
||||
return scratch
|
||||
|
||||
|
||||
def _make_pretrained_efficientnet_lite3(use_pretrained, exportable=False):
|
||||
efficientnet = torch.hub.load(
|
||||
"rwightman/gen-efficientnet-pytorch",
|
||||
"tf_efficientnet_lite3",
|
||||
pretrained=use_pretrained,
|
||||
exportable=exportable
|
||||
)
|
||||
return _make_efficientnet_backbone(efficientnet)
|
||||
|
||||
|
||||
def _make_efficientnet_backbone(effnet):
|
||||
pretrained = nn.Module()
|
||||
|
||||
pretrained.layer1 = nn.Sequential(
|
||||
effnet.conv_stem, effnet.bn1, effnet.act1, *effnet.blocks[0:2]
|
||||
)
|
||||
pretrained.layer2 = nn.Sequential(*effnet.blocks[2:3])
|
||||
pretrained.layer3 = nn.Sequential(*effnet.blocks[3:5])
|
||||
pretrained.layer4 = nn.Sequential(*effnet.blocks[5:9])
|
||||
|
||||
return pretrained
|
||||
|
||||
|
||||
def _make_resnet_backbone(resnet):
|
||||
pretrained = nn.Module()
|
||||
pretrained.layer1 = nn.Sequential(
|
||||
resnet.conv1, resnet.bn1, resnet.relu, resnet.maxpool, resnet.layer1
|
||||
)
|
||||
|
||||
pretrained.layer2 = resnet.layer2
|
||||
pretrained.layer3 = resnet.layer3
|
||||
pretrained.layer4 = resnet.layer4
|
||||
|
||||
return pretrained
|
||||
|
||||
|
||||
def _make_pretrained_resnext101_wsl(use_pretrained):
|
||||
resnet = torch.hub.load("facebookresearch/WSL-Images", "resnext101_32x8d_wsl")
|
||||
return _make_resnet_backbone(resnet)
|
||||
|
||||
|
||||
|
||||
class Interpolate(nn.Module):
|
||||
"""Interpolation module.
|
||||
"""
|
||||
|
||||
def __init__(self, scale_factor, mode, align_corners=False):
|
||||
"""Init.
|
||||
|
||||
Args:
|
||||
scale_factor (float): scaling
|
||||
mode (str): interpolation mode
|
||||
"""
|
||||
super(Interpolate, self).__init__()
|
||||
|
||||
self.interp = nn.functional.interpolate
|
||||
self.scale_factor = scale_factor
|
||||
self.mode = mode
|
||||
self.align_corners = align_corners
|
||||
|
||||
def forward(self, x):
|
||||
"""Forward pass.
|
||||
|
||||
Args:
|
||||
x (tensor): input
|
||||
|
||||
Returns:
|
||||
tensor: interpolated data
|
||||
"""
|
||||
|
||||
x = self.interp(
|
||||
x, scale_factor=self.scale_factor, mode=self.mode, align_corners=self.align_corners
|
||||
)
|
||||
|
||||
return x
|
||||
|
||||
|
||||
class ResidualConvUnit(nn.Module):
|
||||
"""Residual convolution module.
|
||||
"""
|
||||
|
||||
def __init__(self, features):
|
||||
"""Init.
|
||||
|
||||
Args:
|
||||
features (int): number of features
|
||||
"""
|
||||
super().__init__()
|
||||
|
||||
self.conv1 = nn.Conv2d(
|
||||
features, features, kernel_size=3, stride=1, padding=1, bias=True
|
||||
)
|
||||
|
||||
self.conv2 = nn.Conv2d(
|
||||
features, features, kernel_size=3, stride=1, padding=1, bias=True
|
||||
)
|
||||
|
||||
self.relu = nn.ReLU(inplace=True)
|
||||
|
||||
def forward(self, x):
|
||||
"""Forward pass.
|
||||
|
||||
Args:
|
||||
x (tensor): input
|
||||
|
||||
Returns:
|
||||
tensor: output
|
||||
"""
|
||||
out = self.relu(x)
|
||||
out = self.conv1(out)
|
||||
out = self.relu(out)
|
||||
out = self.conv2(out)
|
||||
|
||||
return out + x
|
||||
|
||||
|
||||
class FeatureFusionBlock(nn.Module):
|
||||
"""Feature fusion block.
|
||||
"""
|
||||
|
||||
def __init__(self, features):
|
||||
"""Init.
|
||||
|
||||
Args:
|
||||
features (int): number of features
|
||||
"""
|
||||
super(FeatureFusionBlock, self).__init__()
|
||||
|
||||
self.resConfUnit1 = ResidualConvUnit(features)
|
||||
self.resConfUnit2 = ResidualConvUnit(features)
|
||||
|
||||
def forward(self, *xs):
|
||||
"""Forward pass.
|
||||
|
||||
Returns:
|
||||
tensor: output
|
||||
"""
|
||||
output = xs[0]
|
||||
|
||||
if len(xs) == 2:
|
||||
output += self.resConfUnit1(xs[1])
|
||||
|
||||
output = self.resConfUnit2(output)
|
||||
|
||||
output = nn.functional.interpolate(
|
||||
output, scale_factor=2, mode="bilinear", align_corners=True
|
||||
)
|
||||
|
||||
return output
|
||||
|
||||
|
||||
|
||||
|
||||
class ResidualConvUnit_custom(nn.Module):
|
||||
"""Residual convolution module.
|
||||
"""
|
||||
|
||||
def __init__(self, features, activation, bn):
|
||||
"""Init.
|
||||
|
||||
Args:
|
||||
features (int): number of features
|
||||
"""
|
||||
super().__init__()
|
||||
|
||||
self.bn = bn
|
||||
|
||||
self.groups=1
|
||||
|
||||
self.conv1 = nn.Conv2d(
|
||||
features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups
|
||||
)
|
||||
|
||||
self.conv2 = nn.Conv2d(
|
||||
features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups
|
||||
)
|
||||
|
||||
if self.bn==True:
|
||||
self.bn1 = nn.BatchNorm2d(features)
|
||||
self.bn2 = nn.BatchNorm2d(features)
|
||||
|
||||
self.activation = activation
|
||||
|
||||
self.skip_add = nn.quantized.FloatFunctional()
|
||||
|
||||
def forward(self, x):
|
||||
"""Forward pass.
|
||||
|
||||
Args:
|
||||
x (tensor): input
|
||||
|
||||
Returns:
|
||||
tensor: output
|
||||
"""
|
||||
|
||||
out = self.activation(x)
|
||||
out = self.conv1(out)
|
||||
if self.bn==True:
|
||||
out = self.bn1(out)
|
||||
|
||||
out = self.activation(out)
|
||||
out = self.conv2(out)
|
||||
if self.bn==True:
|
||||
out = self.bn2(out)
|
||||
|
||||
if self.groups > 1:
|
||||
out = self.conv_merge(out)
|
||||
|
||||
return self.skip_add.add(out, x)
|
||||
|
||||
# return out + x
|
||||
|
||||
|
||||
class FeatureFusionBlock_custom(nn.Module):
|
||||
"""Feature fusion block.
|
||||
"""
|
||||
|
||||
def __init__(self, features, activation, deconv=False, bn=False, expand=False, align_corners=True):
|
||||
"""Init.
|
||||
|
||||
Args:
|
||||
features (int): number of features
|
||||
"""
|
||||
super(FeatureFusionBlock_custom, self).__init__()
|
||||
|
||||
self.deconv = deconv
|
||||
self.align_corners = align_corners
|
||||
|
||||
self.groups=1
|
||||
|
||||
self.expand = expand
|
||||
out_features = features
|
||||
if self.expand==True:
|
||||
out_features = features//2
|
||||
|
||||
self.out_conv = nn.Conv2d(features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1)
|
||||
|
||||
self.resConfUnit1 = ResidualConvUnit_custom(features, activation, bn)
|
||||
self.resConfUnit2 = ResidualConvUnit_custom(features, activation, bn)
|
||||
|
||||
self.skip_add = nn.quantized.FloatFunctional()
|
||||
|
||||
def forward(self, *xs):
|
||||
"""Forward pass.
|
||||
|
||||
Returns:
|
||||
tensor: output
|
||||
"""
|
||||
output = xs[0]
|
||||
|
||||
if len(xs) == 2:
|
||||
res = self.resConfUnit1(xs[1])
|
||||
output = self.skip_add.add(output, res)
|
||||
# output += res
|
||||
|
||||
output = self.resConfUnit2(output)
|
||||
|
||||
output = nn.functional.interpolate(
|
||||
output, scale_factor=2, mode="bilinear", align_corners=self.align_corners
|
||||
)
|
||||
|
||||
output = self.out_conv(output)
|
||||
|
||||
return output
|
||||
|
||||
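A shape sketch for the fusion blocks defined above: each call optionally adds a skip input and then upsamples by a factor of two. It assumes this blocks module is importable under the path shown.

# Hypothetical shape sketch for FeatureFusionBlock_custom.
import torch
import torch.nn as nn

from annotator.midas.midas.blocks import FeatureFusionBlock_custom  # import path is an assumption

block = FeatureFusionBlock_custom(features=64, activation=nn.ReLU(False), bn=False)
deep = torch.randn(1, 64, 12, 12)     # coarser decoder feature
skip = torch.randn(1, 64, 12, 12)     # same-resolution encoder feature
out = block(deep, skip)
print(out.shape)                      # torch.Size([1, 64, 24, 24]); spatial size doubled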
109
extensions-builtin/forge_legacy_preprocessors/annotator/midas/midas/dpt_depth.py
Executable file
109
extensions-builtin/forge_legacy_preprocessors/annotator/midas/midas/dpt_depth.py
Executable file
@@ -0,0 +1,109 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
from .base_model import BaseModel
|
||||
from .blocks import (
|
||||
FeatureFusionBlock,
|
||||
FeatureFusionBlock_custom,
|
||||
Interpolate,
|
||||
_make_encoder,
|
||||
forward_vit,
|
||||
)
|
||||
|
||||
|
||||
def _make_fusion_block(features, use_bn):
|
||||
return FeatureFusionBlock_custom(
|
||||
features,
|
||||
nn.ReLU(False),
|
||||
deconv=False,
|
||||
bn=use_bn,
|
||||
expand=False,
|
||||
align_corners=True,
|
||||
)
|
||||
|
||||
|
||||
class DPT(BaseModel):
|
||||
def __init__(
|
||||
self,
|
||||
head,
|
||||
features=256,
|
||||
backbone="vitb_rn50_384",
|
||||
readout="project",
|
||||
channels_last=False,
|
||||
use_bn=False,
|
||||
):
|
||||
|
||||
super(DPT, self).__init__()
|
||||
|
||||
self.channels_last = channels_last
|
||||
|
||||
hooks = {
|
||||
"vitb_rn50_384": [0, 1, 8, 11],
|
||||
"vitb16_384": [2, 5, 8, 11],
|
||||
"vitl16_384": [5, 11, 17, 23],
|
||||
}
|
||||
|
||||
# Instantiate backbone and reassemble blocks
|
||||
self.pretrained, self.scratch = _make_encoder(
|
||||
backbone,
|
||||
features,
|
||||
False, # Set to true of you want to train from scratch, uses ImageNet weights
|
||||
groups=1,
|
||||
expand=False,
|
||||
exportable=False,
|
||||
hooks=hooks[backbone],
|
||||
use_readout=readout,
|
||||
)
|
||||
|
||||
self.scratch.refinenet1 = _make_fusion_block(features, use_bn)
|
||||
self.scratch.refinenet2 = _make_fusion_block(features, use_bn)
|
||||
self.scratch.refinenet3 = _make_fusion_block(features, use_bn)
|
||||
self.scratch.refinenet4 = _make_fusion_block(features, use_bn)
|
||||
|
||||
self.scratch.output_conv = head
|
||||
|
||||
|
||||
def forward(self, x):
|
||||
if self.channels_last == True:
|
||||
x.contiguous(memory_format=torch.channels_last)
|
||||
|
||||
layer_1, layer_2, layer_3, layer_4 = forward_vit(self.pretrained, x)
|
||||
|
||||
layer_1_rn = self.scratch.layer1_rn(layer_1)
|
||||
layer_2_rn = self.scratch.layer2_rn(layer_2)
|
||||
layer_3_rn = self.scratch.layer3_rn(layer_3)
|
||||
layer_4_rn = self.scratch.layer4_rn(layer_4)
|
||||
|
||||
path_4 = self.scratch.refinenet4(layer_4_rn)
|
||||
path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
|
||||
path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
|
||||
path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
|
||||
|
||||
out = self.scratch.output_conv(path_1)
|
||||
|
||||
return out
|
||||
|
||||
|
||||
class DPTDepthModel(DPT):
|
||||
def __init__(self, path=None, non_negative=True, **kwargs):
|
||||
features = kwargs["features"] if "features" in kwargs else 256
|
||||
|
||||
head = nn.Sequential(
|
||||
nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1),
|
||||
Interpolate(scale_factor=2, mode="bilinear", align_corners=True),
|
||||
nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1),
|
||||
nn.ReLU(True),
|
||||
nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
|
||||
nn.ReLU(True) if non_negative else nn.Identity(),
|
||||
nn.Identity(),
|
||||
)
|
||||
|
||||
super().__init__(head, **kwargs)
|
||||
|
||||
if path is not None:
|
||||
self.load(path)
|
||||
|
||||
def forward(self, x):
|
||||
return super().forward(x).squeeze(dim=1)
|
||||
|
||||
@@ -0,0 +1,76 @@
|
||||
"""MidashNet: Network for monocular depth estimation trained by mixing several datasets.
|
||||
This file contains code that is adapted from
|
||||
https://github.com/thomasjpfan/pytorch_refinenet/blob/master/pytorch_refinenet/refinenet/refinenet_4cascade.py
|
||||
"""
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from .base_model import BaseModel
|
||||
from .blocks import FeatureFusionBlock, Interpolate, _make_encoder
|
||||
|
||||
|
||||
class MidasNet(BaseModel):
|
||||
"""Network for monocular depth estimation.
|
||||
"""
|
||||
|
||||
def __init__(self, path=None, features=256, non_negative=True):
|
||||
"""Init.
|
||||
|
||||
Args:
|
||||
path (str, optional): Path to saved model. Defaults to None.
|
||||
features (int, optional): Number of features. Defaults to 256.
|
||||
backbone (str, optional): Backbone network for encoder. Defaults to resnet50
|
||||
"""
|
||||
print("Loading weights: ", path)
|
||||
|
||||
super(MidasNet, self).__init__()
|
||||
|
||||
use_pretrained = False if path is None else True
|
||||
|
||||
self.pretrained, self.scratch = _make_encoder(backbone="resnext101_wsl", features=features, use_pretrained=use_pretrained)
|
||||
|
||||
self.scratch.refinenet4 = FeatureFusionBlock(features)
|
||||
self.scratch.refinenet3 = FeatureFusionBlock(features)
|
||||
self.scratch.refinenet2 = FeatureFusionBlock(features)
|
||||
self.scratch.refinenet1 = FeatureFusionBlock(features)
|
||||
|
||||
self.scratch.output_conv = nn.Sequential(
|
||||
nn.Conv2d(features, 128, kernel_size=3, stride=1, padding=1),
|
||||
Interpolate(scale_factor=2, mode="bilinear"),
|
||||
nn.Conv2d(128, 32, kernel_size=3, stride=1, padding=1),
|
||||
nn.ReLU(True),
|
||||
nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
|
||||
nn.ReLU(True) if non_negative else nn.Identity(),
|
||||
)
|
||||
|
||||
if path:
|
||||
self.load(path)
|
||||
|
||||
def forward(self, x):
|
||||
"""Forward pass.
|
||||
|
||||
Args:
|
||||
x (tensor): input data (image)
|
||||
|
||||
Returns:
|
||||
tensor: depth
|
||||
"""
|
||||
|
||||
layer_1 = self.pretrained.layer1(x)
|
||||
layer_2 = self.pretrained.layer2(layer_1)
|
||||
layer_3 = self.pretrained.layer3(layer_2)
|
||||
layer_4 = self.pretrained.layer4(layer_3)
|
||||
|
||||
layer_1_rn = self.scratch.layer1_rn(layer_1)
|
||||
layer_2_rn = self.scratch.layer2_rn(layer_2)
|
||||
layer_3_rn = self.scratch.layer3_rn(layer_3)
|
||||
layer_4_rn = self.scratch.layer4_rn(layer_4)
|
||||
|
||||
path_4 = self.scratch.refinenet4(layer_4_rn)
|
||||
path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
|
||||
path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
|
||||
path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
|
||||
|
||||
out = self.scratch.output_conv(path_1)
|
||||
|
||||
return torch.squeeze(out, dim=1)
|
||||
@@ -0,0 +1,128 @@
|
||||
"""MidashNet: Network for monocular depth estimation trained by mixing several datasets.
|
||||
This file contains code that is adapted from
|
||||
https://github.com/thomasjpfan/pytorch_refinenet/blob/master/pytorch_refinenet/refinenet/refinenet_4cascade.py
|
||||
"""
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from .base_model import BaseModel
|
||||
from .blocks import FeatureFusionBlock, FeatureFusionBlock_custom, Interpolate, _make_encoder
|
||||
|
||||
|
||||
class MidasNet_small(BaseModel):
|
||||
"""Network for monocular depth estimation.
|
||||
"""
|
||||
|
||||
def __init__(self, path=None, features=64, backbone="efficientnet_lite3", non_negative=True, exportable=True, channels_last=False, align_corners=True,
|
||||
blocks={'expand': True}):
|
||||
"""Init.
|
||||
|
||||
Args:
|
||||
path (str, optional): Path to saved model. Defaults to None.
|
||||
features (int, optional): Number of features. Defaults to 256.
|
||||
backbone (str, optional): Backbone network for encoder. Defaults to resnet50
|
||||
"""
|
||||
print("Loading weights: ", path)
|
||||
|
||||
super(MidasNet_small, self).__init__()
|
||||
|
||||
use_pretrained = False if path else True
|
||||
|
||||
self.channels_last = channels_last
|
||||
self.blocks = blocks
|
||||
self.backbone = backbone
|
||||
|
||||
self.groups = 1
|
||||
|
||||
features1=features
|
||||
features2=features
|
||||
features3=features
|
||||
features4=features
|
||||
self.expand = False
|
||||
if "expand" in self.blocks and self.blocks['expand'] == True:
|
||||
self.expand = True
|
||||
features1=features
|
||||
features2=features*2
|
||||
features3=features*4
|
||||
features4=features*8
|
||||
|
||||
self.pretrained, self.scratch = _make_encoder(self.backbone, features, use_pretrained, groups=self.groups, expand=self.expand, exportable=exportable)
|
||||
|
||||
self.scratch.activation = nn.ReLU(False)
|
||||
|
||||
self.scratch.refinenet4 = FeatureFusionBlock_custom(features4, self.scratch.activation, deconv=False, bn=False, expand=self.expand, align_corners=align_corners)
|
||||
self.scratch.refinenet3 = FeatureFusionBlock_custom(features3, self.scratch.activation, deconv=False, bn=False, expand=self.expand, align_corners=align_corners)
|
||||
self.scratch.refinenet2 = FeatureFusionBlock_custom(features2, self.scratch.activation, deconv=False, bn=False, expand=self.expand, align_corners=align_corners)
|
||||
self.scratch.refinenet1 = FeatureFusionBlock_custom(features1, self.scratch.activation, deconv=False, bn=False, align_corners=align_corners)
|
||||
|
||||
|
||||
self.scratch.output_conv = nn.Sequential(
|
||||
nn.Conv2d(features, features//2, kernel_size=3, stride=1, padding=1, groups=self.groups),
|
||||
Interpolate(scale_factor=2, mode="bilinear"),
|
||||
nn.Conv2d(features//2, 32, kernel_size=3, stride=1, padding=1),
|
||||
self.scratch.activation,
|
||||
nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
|
||||
nn.ReLU(True) if non_negative else nn.Identity(),
|
||||
nn.Identity(),
|
||||
)
|
||||
|
||||
if path:
|
||||
self.load(path)
|
||||
|
||||
|
||||
def forward(self, x):
|
||||
"""Forward pass.
|
||||
|
||||
Args:
|
||||
x (tensor): input data (image)
|
||||
|
||||
Returns:
|
||||
tensor: depth
|
||||
"""
|
||||
if self.channels_last==True:
|
||||
print("self.channels_last = ", self.channels_last)
|
||||
x.contiguous(memory_format=torch.channels_last)
|
||||
|
||||
|
||||
layer_1 = self.pretrained.layer1(x)
|
||||
layer_2 = self.pretrained.layer2(layer_1)
|
||||
layer_3 = self.pretrained.layer3(layer_2)
|
||||
layer_4 = self.pretrained.layer4(layer_3)
|
||||
|
||||
layer_1_rn = self.scratch.layer1_rn(layer_1)
|
||||
layer_2_rn = self.scratch.layer2_rn(layer_2)
|
||||
layer_3_rn = self.scratch.layer3_rn(layer_3)
|
||||
layer_4_rn = self.scratch.layer4_rn(layer_4)
|
||||
|
||||
|
||||
path_4 = self.scratch.refinenet4(layer_4_rn)
|
||||
path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
|
||||
path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
|
||||
path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
|
||||
|
||||
out = self.scratch.output_conv(path_1)
|
||||
|
||||
return torch.squeeze(out, dim=1)
|
||||
|
||||
|
||||
|
||||
def fuse_model(m):
|
||||
prev_previous_type = nn.Identity()
|
||||
prev_previous_name = ''
|
||||
previous_type = nn.Identity()
|
||||
previous_name = ''
|
||||
for name, module in m.named_modules():
|
||||
if prev_previous_type == nn.Conv2d and previous_type == nn.BatchNorm2d and type(module) == nn.ReLU:
|
||||
# print("FUSED ", prev_previous_name, previous_name, name)
|
||||
torch.quantization.fuse_modules(m, [prev_previous_name, previous_name, name], inplace=True)
|
||||
elif prev_previous_type == nn.Conv2d and previous_type == nn.BatchNorm2d:
|
||||
# print("FUSED ", prev_previous_name, previous_name)
|
||||
torch.quantization.fuse_modules(m, [prev_previous_name, previous_name], inplace=True)
|
||||
# elif previous_type == nn.Conv2d and type(module) == nn.ReLU:
|
||||
# print("FUSED ", previous_name, name)
|
||||
# torch.quantization.fuse_modules(m, [previous_name, name], inplace=True)
|
||||
|
||||
prev_previous_type = previous_type
|
||||
prev_previous_name = previous_name
|
||||
previous_type = type(module)
|
||||
previous_name = name
|
||||
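A hedged usage sketch for fuse_model above, which folds Conv2d + BatchNorm2d (+ ReLU) triples into fused modules for quantized or faster inference; fusion only applies in eval mode, and the import path is an assumption.

# Hypothetical usage sketch: fuse a conv-bn-relu stack before inference.
import torch.nn as nn

from annotator.midas.midas.midas_net_custom import fuse_model  # import path is an assumption

m = nn.Sequential(
    nn.Conv2d(3, 8, kernel_size=3, padding=1),
    nn.BatchNorm2d(8),
    nn.ReLU(),
)
m.eval()                 # torch.quantization.fuse_modules only fuses in eval mode
fuse_model(m)
print(m)                 # position 0 becomes a fused conv+bn+relu; positions 1 and 2 become nn.Identity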
@@ -0,0 +1,234 @@
|
||||
import numpy as np
|
||||
import cv2
|
||||
import math
|
||||
|
||||
|
||||
def apply_min_size(sample, size, image_interpolation_method=cv2.INTER_AREA):
|
||||
"""Rezise the sample to ensure the given size. Keeps aspect ratio.
|
||||
|
||||
Args:
|
||||
sample (dict): sample
|
||||
size (tuple): image size
|
||||
|
||||
Returns:
|
||||
tuple: new size
|
||||
"""
|
||||
shape = list(sample["disparity"].shape)
|
||||
|
||||
if shape[0] >= size[0] and shape[1] >= size[1]:
|
||||
return sample
|
||||
|
||||
scale = [0, 0]
|
||||
scale[0] = size[0] / shape[0]
|
||||
scale[1] = size[1] / shape[1]
|
||||
|
||||
scale = max(scale)
|
||||
|
||||
shape[0] = math.ceil(scale * shape[0])
|
||||
shape[1] = math.ceil(scale * shape[1])
|
||||
|
||||
# resize
|
||||
sample["image"] = cv2.resize(
|
||||
sample["image"], tuple(shape[::-1]), interpolation=image_interpolation_method
|
||||
)
|
||||
|
||||
sample["disparity"] = cv2.resize(
|
||||
sample["disparity"], tuple(shape[::-1]), interpolation=cv2.INTER_NEAREST
|
||||
)
|
||||
sample["mask"] = cv2.resize(
|
||||
sample["mask"].astype(np.float32),
|
||||
tuple(shape[::-1]),
|
||||
interpolation=cv2.INTER_NEAREST,
|
||||
)
|
||||
sample["mask"] = sample["mask"].astype(bool)
|
||||
|
||||
return tuple(shape)
|
||||
|
||||
|
||||
class Resize(object):
|
||||
"""Resize sample to given size (width, height).
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
width,
|
||||
height,
|
||||
resize_target=True,
|
||||
keep_aspect_ratio=False,
|
||||
ensure_multiple_of=1,
|
||||
resize_method="lower_bound",
|
||||
image_interpolation_method=cv2.INTER_AREA,
|
||||
):
|
||||
"""Init.
|
||||
|
||||
Args:
|
||||
width (int): desired output width
|
||||
height (int): desired output height
|
||||
resize_target (bool, optional):
|
||||
True: Resize the full sample (image, mask, target).
|
||||
False: Resize image only.
|
||||
Defaults to True.
|
||||
keep_aspect_ratio (bool, optional):
|
||||
True: Keep the aspect ratio of the input sample.
|
||||
Output sample might not have the given width and height, and
|
||||
resize behaviour depends on the parameter 'resize_method'.
|
||||
Defaults to False.
|
||||
ensure_multiple_of (int, optional):
|
||||
Output width and height is constrained to be multiple of this parameter.
|
||||
Defaults to 1.
|
||||
resize_method (str, optional):
|
||||
"lower_bound": Output will be at least as large as the given size.
|
||||
"upper_bound": Output will be at max as large as the given size. (Output size might be smaller than given size.)
|
||||
"minimal": Scale as least as possible. (Output size might be smaller than given size.)
|
||||
Defaults to "lower_bound".
|
||||
"""
|
||||
self.__width = width
|
||||
self.__height = height
|
||||
|
||||
self.__resize_target = resize_target
|
||||
self.__keep_aspect_ratio = keep_aspect_ratio
|
||||
self.__multiple_of = ensure_multiple_of
|
||||
self.__resize_method = resize_method
|
||||
self.__image_interpolation_method = image_interpolation_method
|
||||
|
||||
def constrain_to_multiple_of(self, x, min_val=0, max_val=None):
|
||||
y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int)
|
||||
|
||||
if max_val is not None and y > max_val:
|
||||
y = (np.floor(x / self.__multiple_of) * self.__multiple_of).astype(int)
|
||||
|
||||
if y < min_val:
|
||||
y = (np.ceil(x / self.__multiple_of) * self.__multiple_of).astype(int)
|
||||
|
||||
return y
|
||||
|
||||
def get_size(self, width, height):
|
||||
# determine new height and width
|
||||
scale_height = self.__height / height
|
||||
scale_width = self.__width / width
|
||||
|
||||
if self.__keep_aspect_ratio:
|
||||
if self.__resize_method == "lower_bound":
|
||||
# scale such that output size is lower bound
|
||||
if scale_width > scale_height:
|
||||
# fit width
|
||||
scale_height = scale_width
|
||||
else:
|
||||
# fit height
|
||||
scale_width = scale_height
|
||||
elif self.__resize_method == "upper_bound":
|
||||
# scale such that output size is upper bound
|
||||
if scale_width < scale_height:
|
||||
# fit width
|
||||
scale_height = scale_width
|
||||
else:
|
||||
# fit height
|
||||
scale_width = scale_height
|
||||
elif self.__resize_method == "minimal":
|
||||
# scale as least as possbile
|
||||
if abs(1 - scale_width) < abs(1 - scale_height):
|
||||
# fit width
|
||||
scale_height = scale_width
|
||||
else:
|
||||
# fit height
|
||||
scale_width = scale_height
|
||||
else:
|
||||
raise ValueError(
|
||||
f"resize_method {self.__resize_method} not implemented"
|
||||
)
|
||||
|
||||
if self.__resize_method == "lower_bound":
|
||||
new_height = self.constrain_to_multiple_of(
|
||||
scale_height * height, min_val=self.__height
|
||||
)
|
||||
new_width = self.constrain_to_multiple_of(
|
||||
scale_width * width, min_val=self.__width
|
||||
)
|
||||
elif self.__resize_method == "upper_bound":
|
||||
new_height = self.constrain_to_multiple_of(
|
||||
scale_height * height, max_val=self.__height
|
||||
)
|
||||
new_width = self.constrain_to_multiple_of(
|
||||
scale_width * width, max_val=self.__width
|
||||
)
|
||||
elif self.__resize_method == "minimal":
|
||||
new_height = self.constrain_to_multiple_of(scale_height * height)
|
||||
new_width = self.constrain_to_multiple_of(scale_width * width)
|
||||
else:
|
||||
raise ValueError(f"resize_method {self.__resize_method} not implemented")
|
||||
|
||||
return (new_width, new_height)
|
||||
|
||||
def __call__(self, sample):
|
||||
width, height = self.get_size(
|
||||
sample["image"].shape[1], sample["image"].shape[0]
|
||||
)
|
||||
|
||||
# resize sample
|
||||
sample["image"] = cv2.resize(
|
||||
sample["image"],
|
||||
(width, height),
|
||||
interpolation=self.__image_interpolation_method,
|
||||
)
|
||||
|
||||
if self.__resize_target:
|
||||
if "disparity" in sample:
|
||||
sample["disparity"] = cv2.resize(
|
||||
sample["disparity"],
|
||||
(width, height),
|
||||
interpolation=cv2.INTER_NEAREST,
|
||||
)
|
||||
|
||||
if "depth" in sample:
|
||||
sample["depth"] = cv2.resize(
|
||||
sample["depth"], (width, height), interpolation=cv2.INTER_NEAREST
|
||||
)
|
||||
|
||||
sample["mask"] = cv2.resize(
|
||||
sample["mask"].astype(np.float32),
|
||||
(width, height),
|
||||
interpolation=cv2.INTER_NEAREST,
|
||||
)
|
||||
sample["mask"] = sample["mask"].astype(bool)
|
||||
|
||||
return sample
|
||||
|
||||
|
||||
class NormalizeImage(object):
|
||||
"""Normlize image by given mean and std.
|
||||
"""
|
||||
|
||||
def __init__(self, mean, std):
|
||||
self.__mean = mean
|
||||
self.__std = std
|
||||
|
||||
def __call__(self, sample):
|
||||
sample["image"] = (sample["image"] - self.__mean) / self.__std
|
||||
|
||||
return sample
|
||||
|
||||
|
||||
class PrepareForNet(object):
|
||||
"""Prepare sample for usage as network input.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def __call__(self, sample):
|
||||
image = np.transpose(sample["image"], (2, 0, 1))
|
||||
sample["image"] = np.ascontiguousarray(image).astype(np.float32)
|
||||
|
||||
if "mask" in sample:
|
||||
sample["mask"] = sample["mask"].astype(np.float32)
|
||||
sample["mask"] = np.ascontiguousarray(sample["mask"])
|
||||
|
||||
if "disparity" in sample:
|
||||
disparity = sample["disparity"].astype(np.float32)
|
||||
sample["disparity"] = np.ascontiguousarray(disparity)
|
||||
|
||||
if "depth" in sample:
|
||||
depth = sample["depth"].astype(np.float32)
|
||||
sample["depth"] = np.ascontiguousarray(depth)
|
||||
|
||||
return sample
|
||||
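A sketch of the full preprocessing pipeline these transforms are composed into, mirroring the "dpt_hybrid" branch of load_midas_transform: an aspect-preserving resize to a multiple of 32, normalization, then CHW conversion. The import path is an assumption and the input is random data.

# Hypothetical sketch of the MiDaS preprocessing pipeline built from the classes above.
import cv2
import numpy as np
from torchvision.transforms import Compose

from annotator.midas.midas.transforms import Resize, NormalizeImage, PrepareForNet  # import path is an assumption

transform = Compose([
    Resize(384, 384, resize_target=None, keep_aspect_ratio=True, ensure_multiple_of=32,
           resize_method="minimal", image_interpolation_method=cv2.INTER_CUBIC),
    NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
    PrepareForNet(),
])

img = np.random.rand(480, 640, 3).astype(np.float32)   # HWC float image in [0, 1]
sample = transform({"image": img})
print(sample["image"].shape)   # CHW float32, e.g. (3, 384, 512) with "minimal" resizing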
491
extensions-builtin/forge_legacy_preprocessors/annotator/midas/midas/vit.py
Executable file
491
extensions-builtin/forge_legacy_preprocessors/annotator/midas/midas/vit.py
Executable file
@@ -0,0 +1,491 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import timm
|
||||
import types
|
||||
import math
|
||||
import torch.nn.functional as F
|
||||
|
||||
|
||||
class Slice(nn.Module):
|
||||
def __init__(self, start_index=1):
|
||||
super(Slice, self).__init__()
|
||||
self.start_index = start_index
|
||||
|
||||
def forward(self, x):
|
||||
return x[:, self.start_index :]
|
||||
|
||||
|
||||
class AddReadout(nn.Module):
|
||||
def __init__(self, start_index=1):
|
||||
super(AddReadout, self).__init__()
|
||||
self.start_index = start_index
|
||||
|
||||
def forward(self, x):
|
||||
if self.start_index == 2:
|
||||
readout = (x[:, 0] + x[:, 1]) / 2
|
||||
else:
|
||||
readout = x[:, 0]
|
||||
return x[:, self.start_index :] + readout.unsqueeze(1)
|
||||
|
||||
|
||||
class ProjectReadout(nn.Module):
|
||||
def __init__(self, in_features, start_index=1):
|
||||
super(ProjectReadout, self).__init__()
|
||||
self.start_index = start_index
|
||||
|
||||
self.project = nn.Sequential(nn.Linear(2 * in_features, in_features), nn.GELU())
|
||||
|
||||
def forward(self, x):
|
||||
readout = x[:, 0].unsqueeze(1).expand_as(x[:, self.start_index :])
|
||||
features = torch.cat((x[:, self.start_index :], readout), -1)
|
||||
|
||||
return self.project(features)
|
||||
|
||||
|
||||
class Transpose(nn.Module):
|
||||
def __init__(self, dim0, dim1):
|
||||
super(Transpose, self).__init__()
|
||||
self.dim0 = dim0
|
||||
self.dim1 = dim1
|
||||
|
||||
def forward(self, x):
|
||||
x = x.transpose(self.dim0, self.dim1)
|
||||
return x
|
||||
|
||||
|
||||
def forward_vit(pretrained, x):
|
||||
b, c, h, w = x.shape
|
||||
|
||||
glob = pretrained.model.forward_flex(x)
|
||||
|
||||
layer_1 = pretrained.activations["1"]
|
||||
layer_2 = pretrained.activations["2"]
|
||||
layer_3 = pretrained.activations["3"]
|
||||
layer_4 = pretrained.activations["4"]
|
||||
|
||||
layer_1 = pretrained.act_postprocess1[0:2](layer_1)
|
||||
layer_2 = pretrained.act_postprocess2[0:2](layer_2)
|
||||
layer_3 = pretrained.act_postprocess3[0:2](layer_3)
|
||||
layer_4 = pretrained.act_postprocess4[0:2](layer_4)
|
||||
|
||||
unflatten = nn.Sequential(
|
||||
nn.Unflatten(
|
||||
2,
|
||||
torch.Size(
|
||||
[
|
||||
h // pretrained.model.patch_size[1],
|
||||
w // pretrained.model.patch_size[0],
|
||||
]
|
||||
),
|
||||
)
|
||||
)
|
||||
|
||||
if layer_1.ndim == 3:
|
||||
layer_1 = unflatten(layer_1)
|
||||
if layer_2.ndim == 3:
|
||||
layer_2 = unflatten(layer_2)
|
||||
if layer_3.ndim == 3:
|
||||
layer_3 = unflatten(layer_3)
|
||||
if layer_4.ndim == 3:
|
||||
layer_4 = unflatten(layer_4)
|
||||
|
||||
layer_1 = pretrained.act_postprocess1[3 : len(pretrained.act_postprocess1)](layer_1)
|
||||
layer_2 = pretrained.act_postprocess2[3 : len(pretrained.act_postprocess2)](layer_2)
|
||||
layer_3 = pretrained.act_postprocess3[3 : len(pretrained.act_postprocess3)](layer_3)
|
||||
layer_4 = pretrained.act_postprocess4[3 : len(pretrained.act_postprocess4)](layer_4)
|
||||
|
||||
return layer_1, layer_2, layer_3, layer_4
|
||||
|
||||
|
||||
def _resize_pos_embed(self, posemb, gs_h, gs_w):
|
||||
posemb_tok, posemb_grid = (
|
||||
posemb[:, : self.start_index],
|
||||
posemb[0, self.start_index :],
|
||||
)
|
||||
|
||||
gs_old = int(math.sqrt(len(posemb_grid)))
|
||||
|
||||
posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2)
|
||||
posemb_grid = F.interpolate(posemb_grid, size=(gs_h, gs_w), mode="bilinear")
|
||||
posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_h * gs_w, -1)
|
||||
|
||||
posemb = torch.cat([posemb_tok, posemb_grid], dim=1)
|
||||
|
||||
return posemb
|
||||
|
||||
|
||||
def forward_flex(self, x):
|
||||
b, c, h, w = x.shape
|
||||
|
||||
pos_embed = self._resize_pos_embed(
|
||||
self.pos_embed, h // self.patch_size[1], w // self.patch_size[0]
|
||||
)
|
||||
|
||||
B = x.shape[0]
|
||||
|
||||
if hasattr(self.patch_embed, "backbone"):
|
||||
x = self.patch_embed.backbone(x)
|
||||
if isinstance(x, (list, tuple)):
|
||||
x = x[-1] # last feature if backbone outputs list/tuple of features
|
||||
|
||||
x = self.patch_embed.proj(x).flatten(2).transpose(1, 2)
|
||||
|
||||
if getattr(self, "dist_token", None) is not None:
|
||||
cls_tokens = self.cls_token.expand(
|
||||
B, -1, -1
|
||||
) # stole cls_tokens impl from Phil Wang, thanks
|
||||
dist_token = self.dist_token.expand(B, -1, -1)
|
||||
x = torch.cat((cls_tokens, dist_token, x), dim=1)
|
||||
else:
|
||||
cls_tokens = self.cls_token.expand(
|
||||
B, -1, -1
|
||||
) # stole cls_tokens impl from Phil Wang, thanks
|
||||
x = torch.cat((cls_tokens, x), dim=1)
|
||||
|
||||
x = x + pos_embed
|
||||
x = self.pos_drop(x)
|
||||
|
||||
for blk in self.blocks:
|
||||
x = blk(x)
|
||||
|
||||
x = self.norm(x)
|
||||
|
||||
return x
|
||||
|
||||
|
||||
activations = {}
|
||||
|
||||
|
||||
def get_activation(name):
|
||||
def hook(model, input, output):
|
||||
activations[name] = output
|
||||
|
||||
return hook
|
||||
|
||||
|
||||
def get_readout_oper(vit_features, features, use_readout, start_index=1):
|
||||
if use_readout == "ignore":
|
||||
readout_oper = [Slice(start_index)] * len(features)
|
||||
elif use_readout == "add":
|
||||
readout_oper = [AddReadout(start_index)] * len(features)
|
||||
elif use_readout == "project":
|
||||
readout_oper = [
|
||||
ProjectReadout(vit_features, start_index) for out_feat in features
|
||||
]
|
||||
else:
|
||||
assert (
|
||||
False
|
||||
), "wrong operation for readout token, use_readout can be 'ignore', 'add', or 'project'"
|
||||
|
||||
return readout_oper
|
||||
|
||||
|
||||
def _make_vit_b16_backbone(
|
||||
model,
|
||||
features=[96, 192, 384, 768],
|
||||
size=[384, 384],
|
||||
hooks=[2, 5, 8, 11],
|
||||
vit_features=768,
|
||||
use_readout="ignore",
|
||||
start_index=1,
|
||||
):
|
||||
pretrained = nn.Module()
|
||||
|
||||
pretrained.model = model
|
||||
pretrained.model.blocks[hooks[0]].register_forward_hook(get_activation("1"))
|
||||
pretrained.model.blocks[hooks[1]].register_forward_hook(get_activation("2"))
|
||||
pretrained.model.blocks[hooks[2]].register_forward_hook(get_activation("3"))
|
||||
pretrained.model.blocks[hooks[3]].register_forward_hook(get_activation("4"))
|
||||
|
||||
pretrained.activations = activations
|
||||
|
||||
readout_oper = get_readout_oper(vit_features, features, use_readout, start_index)
|
||||
|
||||
# 32, 48, 136, 384
|
||||
pretrained.act_postprocess1 = nn.Sequential(
|
||||
readout_oper[0],
|
||||
Transpose(1, 2),
|
||||
nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
|
||||
nn.Conv2d(
|
||||
in_channels=vit_features,
|
||||
out_channels=features[0],
|
||||
kernel_size=1,
|
||||
stride=1,
|
||||
padding=0,
|
||||
),
|
||||
nn.ConvTranspose2d(
|
||||
in_channels=features[0],
|
||||
out_channels=features[0],
|
||||
kernel_size=4,
|
||||
stride=4,
|
||||
padding=0,
|
||||
bias=True,
|
||||
dilation=1,
|
||||
groups=1,
|
||||
),
|
||||
)
|
||||
|
||||
pretrained.act_postprocess2 = nn.Sequential(
|
||||
readout_oper[1],
|
||||
Transpose(1, 2),
|
||||
nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
|
||||
nn.Conv2d(
|
||||
in_channels=vit_features,
|
||||
out_channels=features[1],
|
||||
kernel_size=1,
|
||||
stride=1,
|
||||
padding=0,
|
||||
),
|
||||
nn.ConvTranspose2d(
|
||||
in_channels=features[1],
|
||||
out_channels=features[1],
|
||||
kernel_size=2,
|
||||
stride=2,
|
||||
padding=0,
|
||||
bias=True,
|
||||
dilation=1,
|
||||
groups=1,
|
||||
),
|
||||
)
|
||||
|
||||
pretrained.act_postprocess3 = nn.Sequential(
|
||||
readout_oper[2],
|
||||
Transpose(1, 2),
|
||||
nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
|
||||
nn.Conv2d(
|
||||
in_channels=vit_features,
|
||||
out_channels=features[2],
|
||||
kernel_size=1,
|
||||
stride=1,
|
||||
padding=0,
|
||||
),
|
||||
)
|
||||
|
||||
pretrained.act_postprocess4 = nn.Sequential(
|
||||
readout_oper[3],
|
||||
Transpose(1, 2),
|
||||
nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
|
||||
nn.Conv2d(
|
||||
in_channels=vit_features,
|
||||
out_channels=features[3],
|
||||
kernel_size=1,
|
||||
stride=1,
|
||||
padding=0,
|
||||
),
|
||||
nn.Conv2d(
|
||||
in_channels=features[3],
|
||||
out_channels=features[3],
|
||||
kernel_size=3,
|
||||
stride=2,
|
||||
padding=1,
|
||||
),
|
||||
)
|
||||
|
||||
pretrained.model.start_index = start_index
|
||||
pretrained.model.patch_size = [16, 16]
|
||||
|
||||
# We inject this function into the VisionTransformer instances so that
|
||||
# we can use it with interpolated position embeddings without modifying the library source.
|
||||
pretrained.model.forward_flex = types.MethodType(forward_flex, pretrained.model)
|
||||
pretrained.model._resize_pos_embed = types.MethodType(
|
||||
_resize_pos_embed, pretrained.model
|
||||
)
|
||||
|
||||
return pretrained
|
||||
|
||||
|
||||
def _make_pretrained_vitl16_384(pretrained, use_readout="ignore", hooks=None):
|
||||
model = timm.create_model("vit_large_patch16_384", pretrained=pretrained)
|
||||
|
||||
hooks = [5, 11, 17, 23] if hooks == None else hooks
|
||||
return _make_vit_b16_backbone(
|
||||
model,
|
||||
features=[256, 512, 1024, 1024],
|
||||
hooks=hooks,
|
||||
vit_features=1024,
|
||||
use_readout=use_readout,
|
||||
)
|
||||
|
||||
|
||||
def _make_pretrained_vitb16_384(pretrained, use_readout="ignore", hooks=None):
|
||||
model = timm.create_model("vit_base_patch16_384", pretrained=pretrained)
|
||||
|
||||
hooks = [2, 5, 8, 11] if hooks == None else hooks
|
||||
return _make_vit_b16_backbone(
|
||||
model, features=[96, 192, 384, 768], hooks=hooks, use_readout=use_readout
|
||||
)
|
||||
|
||||
|
||||
def _make_pretrained_deitb16_384(pretrained, use_readout="ignore", hooks=None):
|
||||
model = timm.create_model("vit_deit_base_patch16_384", pretrained=pretrained)
|
||||
|
||||
hooks = [2, 5, 8, 11] if hooks == None else hooks
|
||||
return _make_vit_b16_backbone(
|
||||
model, features=[96, 192, 384, 768], hooks=hooks, use_readout=use_readout
|
||||
)
|
||||
|
||||
|
||||
def _make_pretrained_deitb16_distil_384(pretrained, use_readout="ignore", hooks=None):
|
||||
model = timm.create_model(
|
||||
"vit_deit_base_distilled_patch16_384", pretrained=pretrained
|
||||
)
|
||||
|
||||
hooks = [2, 5, 8, 11] if hooks == None else hooks
|
||||
return _make_vit_b16_backbone(
|
||||
model,
|
||||
features=[96, 192, 384, 768],
|
||||
hooks=hooks,
|
||||
use_readout=use_readout,
|
||||
start_index=2,
|
||||
)
|
||||
|
||||
|
||||
def _make_vit_b_rn50_backbone(
|
||||
model,
|
||||
features=[256, 512, 768, 768],
|
||||
size=[384, 384],
|
||||
hooks=[0, 1, 8, 11],
|
||||
vit_features=768,
|
||||
use_vit_only=False,
|
||||
use_readout="ignore",
|
||||
start_index=1,
|
||||
):
|
||||
pretrained = nn.Module()
|
||||
|
||||
pretrained.model = model
|
||||
|
||||
if use_vit_only == True:
|
||||
pretrained.model.blocks[hooks[0]].register_forward_hook(get_activation("1"))
|
||||
pretrained.model.blocks[hooks[1]].register_forward_hook(get_activation("2"))
|
||||
else:
|
||||
pretrained.model.patch_embed.backbone.stages[0].register_forward_hook(
|
||||
get_activation("1")
|
||||
)
|
||||
pretrained.model.patch_embed.backbone.stages[1].register_forward_hook(
|
||||
get_activation("2")
|
||||
)
|
||||
|
||||
pretrained.model.blocks[hooks[2]].register_forward_hook(get_activation("3"))
|
||||
pretrained.model.blocks[hooks[3]].register_forward_hook(get_activation("4"))
|
||||
|
||||
pretrained.activations = activations
|
||||
|
||||
readout_oper = get_readout_oper(vit_features, features, use_readout, start_index)
|
||||
|
||||
if use_vit_only == True:
|
||||
pretrained.act_postprocess1 = nn.Sequential(
|
||||
readout_oper[0],
|
||||
Transpose(1, 2),
|
||||
nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
|
||||
nn.Conv2d(
|
||||
in_channels=vit_features,
|
||||
out_channels=features[0],
|
||||
kernel_size=1,
|
||||
stride=1,
|
||||
padding=0,
|
||||
),
|
||||
nn.ConvTranspose2d(
|
||||
in_channels=features[0],
|
||||
out_channels=features[0],
|
||||
kernel_size=4,
|
||||
stride=4,
|
||||
padding=0,
|
||||
bias=True,
|
||||
dilation=1,
|
||||
groups=1,
|
||||
),
|
||||
)
|
||||
|
||||
pretrained.act_postprocess2 = nn.Sequential(
|
||||
readout_oper[1],
|
||||
Transpose(1, 2),
|
||||
nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
|
||||
nn.Conv2d(
|
||||
in_channels=vit_features,
|
||||
out_channels=features[1],
|
||||
kernel_size=1,
|
||||
stride=1,
|
||||
padding=0,
|
||||
),
|
||||
nn.ConvTranspose2d(
|
||||
in_channels=features[1],
|
||||
out_channels=features[1],
|
||||
kernel_size=2,
|
||||
stride=2,
|
||||
padding=0,
|
||||
bias=True,
|
||||
dilation=1,
|
||||
groups=1,
|
||||
),
|
||||
)
|
||||
else:
|
||||
pretrained.act_postprocess1 = nn.Sequential(
|
||||
nn.Identity(), nn.Identity(), nn.Identity()
|
||||
)
|
||||
pretrained.act_postprocess2 = nn.Sequential(
|
||||
nn.Identity(), nn.Identity(), nn.Identity()
|
||||
)
|
||||
|
||||
pretrained.act_postprocess3 = nn.Sequential(
|
||||
readout_oper[2],
|
||||
Transpose(1, 2),
|
||||
nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
|
||||
nn.Conv2d(
|
||||
in_channels=vit_features,
|
||||
out_channels=features[2],
|
||||
kernel_size=1,
|
||||
stride=1,
|
||||
padding=0,
|
||||
),
|
||||
)
|
||||
|
||||
pretrained.act_postprocess4 = nn.Sequential(
|
||||
readout_oper[3],
|
||||
Transpose(1, 2),
|
||||
nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
|
||||
nn.Conv2d(
|
||||
in_channels=vit_features,
|
||||
out_channels=features[3],
|
||||
kernel_size=1,
|
||||
stride=1,
|
||||
padding=0,
|
||||
),
|
||||
nn.Conv2d(
|
||||
in_channels=features[3],
|
||||
out_channels=features[3],
|
||||
kernel_size=3,
|
||||
stride=2,
|
||||
padding=1,
|
||||
),
|
||||
)
|
||||
|
||||
pretrained.model.start_index = start_index
|
||||
pretrained.model.patch_size = [16, 16]
|
||||
|
||||
# We inject this function into the VisionTransformer instances so that
|
||||
# we can use it with interpolated position embeddings without modifying the library source.
|
||||
pretrained.model.forward_flex = types.MethodType(forward_flex, pretrained.model)
|
||||
|
||||
# We inject this function into the VisionTransformer instances so that
|
||||
# we can use it with interpolated position embeddings without modifying the library source.
|
||||
pretrained.model._resize_pos_embed = types.MethodType(
|
||||
_resize_pos_embed, pretrained.model
|
||||
)
|
||||
|
||||
return pretrained
|
||||
|
||||
|
||||
def _make_pretrained_vitb_rn50_384(
    pretrained, use_readout="ignore", hooks=None, use_vit_only=False
):
    model = timm.create_model("vit_base_resnet50_384", pretrained=pretrained)

    hooks = [0, 1, 8, 11] if hooks is None else hooks
    return _make_vit_b_rn50_backbone(
        model,
        features=[256, 512, 768, 768],
        size=[384, 384],
        hooks=hooks,
        use_vit_only=use_vit_only,
        use_readout=use_readout,
    )
|
||||
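A minimal sanity-check sketch for the hybrid backbone above, assuming a timm version that still provides vit_base_resnet50_384; it skips pretrained weights and only verifies that the four registered forward hooks populate the shared activations dict that forward_vit later reads. The input size and dict keys are illustrative, not mandated by the original file.

import torch

backbone = _make_pretrained_vitb_rn50_384(pretrained=False, use_vit_only=False)
x = torch.zeros(1, 3, 384, 384)
with torch.no_grad():
    backbone.model.forward_flex(x)  # the patched forward; running it fires the hooks
for key in ("1", "2", "3", "4"):
    # "1"/"2" come from the ResNet stages (4-D maps), "3"/"4" from transformer blocks (token tensors)
    print(key, backbone.activations[key].shape)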
189
extensions-builtin/forge_legacy_preprocessors/annotator/midas/utils.py
Executable file
@@ -0,0 +1,189 @@
|
||||
"""Utils for monoDepth."""
|
||||
import sys
|
||||
import re
|
||||
import numpy as np
|
||||
import cv2
|
||||
import torch
|
||||
|
||||
|
||||
def read_pfm(path):
|
||||
"""Read pfm file.
|
||||
|
||||
Args:
|
||||
path (str): path to file
|
||||
|
||||
Returns:
|
||||
tuple: (data, scale)
|
||||
"""
|
||||
with open(path, "rb") as file:
|
||||
|
||||
color = None
|
||||
width = None
|
||||
height = None
|
||||
scale = None
|
||||
endian = None
|
||||
|
||||
header = file.readline().rstrip()
|
||||
if header.decode("ascii") == "PF":
|
||||
color = True
|
||||
elif header.decode("ascii") == "Pf":
|
||||
color = False
|
||||
else:
|
||||
raise Exception("Not a PFM file: " + path)
|
||||
|
||||
dim_match = re.match(r"^(\d+)\s(\d+)\s$", file.readline().decode("ascii"))
|
||||
if dim_match:
|
||||
width, height = list(map(int, dim_match.groups()))
|
||||
else:
|
||||
raise Exception("Malformed PFM header.")
|
||||
|
||||
scale = float(file.readline().decode("ascii").rstrip())
|
||||
if scale < 0:
|
||||
# little-endian
|
||||
endian = "<"
|
||||
scale = -scale
|
||||
else:
|
||||
# big-endian
|
||||
endian = ">"
|
||||
|
||||
data = np.fromfile(file, endian + "f")
|
||||
shape = (height, width, 3) if color else (height, width)
|
||||
|
||||
data = np.reshape(data, shape)
|
||||
data = np.flipud(data)
|
||||
|
||||
return data, scale
|
||||
|
||||
|
||||
def write_pfm(path, image, scale=1):
|
||||
"""Write pfm file.
|
||||
|
||||
Args:
|
||||
path (str): path to file
|
||||
image (array): data
|
||||
scale (int, optional): Scale. Defaults to 1.
|
||||
"""
|
||||
|
||||
with open(path, "wb") as file:
|
||||
color = None
|
||||
|
||||
if image.dtype.name != "float32":
|
||||
raise Exception("Image dtype must be float32.")
|
||||
|
||||
image = np.flipud(image)
|
||||
|
||||
if len(image.shape) == 3 and image.shape[2] == 3: # color image
|
||||
color = True
|
||||
elif (
|
||||
len(image.shape) == 2 or len(image.shape) == 3 and image.shape[2] == 1
|
||||
): # greyscale
|
||||
color = False
|
||||
else:
|
||||
raise Exception("Image must have H x W x 3, H x W x 1 or H x W dimensions.")
|
||||
|
||||
file.write("PF\n" if color else "Pf\n".encode())
|
||||
file.write("%d %d\n".encode() % (image.shape[1], image.shape[0]))
|
||||
|
||||
endian = image.dtype.byteorder
|
||||
|
||||
if endian == "<" or endian == "=" and sys.byteorder == "little":
|
||||
scale = -scale
|
||||
|
||||
file.write("%f\n".encode() % scale)
|
||||
|
||||
image.tofile(file)
|
||||
|
||||
|
||||
def read_image(path):
|
||||
"""Read image and output RGB image (0-1).
|
||||
|
||||
Args:
|
||||
path (str): path to file
|
||||
|
||||
Returns:
|
||||
array: RGB image (0-1)
|
||||
"""
|
||||
img = cv2.imread(path)
|
||||
|
||||
if img.ndim == 2:
|
||||
img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
|
||||
|
||||
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) / 255.0
|
||||
|
||||
return img
|
||||
|
||||
|
||||
def resize_image(img):
|
||||
"""Resize image and make it fit for network.
|
||||
|
||||
Args:
|
||||
img (array): image
|
||||
|
||||
Returns:
|
||||
tensor: data ready for network
|
||||
"""
|
||||
height_orig = img.shape[0]
|
||||
width_orig = img.shape[1]
|
||||
|
||||
if width_orig > height_orig:
|
||||
scale = width_orig / 384
|
||||
else:
|
||||
scale = height_orig / 384
|
||||
|
||||
height = (np.ceil(height_orig / scale / 32) * 32).astype(int)
|
||||
width = (np.ceil(width_orig / scale / 32) * 32).astype(int)
|
||||
|
||||
img_resized = cv2.resize(img, (width, height), interpolation=cv2.INTER_AREA)
|
||||
|
||||
img_resized = (
|
||||
torch.from_numpy(np.transpose(img_resized, (2, 0, 1))).contiguous().float()
|
||||
)
|
||||
img_resized = img_resized.unsqueeze(0)
|
||||
|
||||
return img_resized
|
||||
|
||||
|
||||
def resize_depth(depth, width, height):
|
||||
"""Resize depth map and bring to CPU (numpy).
|
||||
|
||||
Args:
|
||||
depth (tensor): depth
|
||||
width (int): image width
|
||||
height (int): image height
|
||||
|
||||
Returns:
|
||||
array: processed depth
|
||||
"""
|
||||
depth = torch.squeeze(depth[0, :, :, :]).to("cpu")
|
||||
|
||||
depth_resized = cv2.resize(
|
||||
depth.numpy(), (width, height), interpolation=cv2.INTER_CUBIC
|
||||
)
|
||||
|
||||
return depth_resized
|
||||
|
||||
def write_depth(path, depth, bits=1):
|
||||
"""Write depth map to pfm and png file.
|
||||
|
||||
Args:
|
||||
path (str): filepath without extension
|
||||
depth (array): depth
bits (int, optional): 1 writes an 8-bit png, 2 a 16-bit png. Defaults to 1.
"""
|
||||
write_pfm(path + ".pfm", depth.astype(np.float32))
|
||||
|
||||
depth_min = depth.min()
|
||||
depth_max = depth.max()
|
||||
|
||||
max_val = (2**(8*bits))-1
|
||||
|
||||
if depth_max - depth_min > np.finfo("float").eps:
|
||||
out = max_val * (depth - depth_min) / (depth_max - depth_min)
|
||||
else:
|
||||
out = np.zeros(depth.shape, dtype=depth.dtype)
|
||||
|
||||
if bits == 1:
|
||||
cv2.imwrite(path + ".png", out.astype("uint8"))
|
||||
elif bits == 2:
|
||||
cv2.imwrite(path + ".png", out.astype("uint16"))
|
||||
|
||||
return
|
||||
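A small round-trip sketch for the helpers above, assuming write_depth and read_pfm are in scope (e.g. imported from this utils module); it uses synthetic data, so no depth network is required, and the file names are illustrative.

import numpy as np

depth = np.random.rand(240, 320).astype(np.float32)  # stand-in for a MiDaS prediction
write_depth("example_depth", depth, bits=2)           # writes example_depth.pfm plus a 16-bit example_depth.png
data, scale = read_pfm("example_depth.pfm")           # data comes back as float32 with the stored scale
assert data.shape == depth.shape and data.dtype == np.float32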
201
extensions-builtin/forge_legacy_preprocessors/annotator/mlsd/LICENSE
Executable file
@@ -0,0 +1,201 @@
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "{}"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright 2021-present NAVER Corp.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
49
extensions-builtin/forge_legacy_preprocessors/annotator/mlsd/__init__.py
Executable file
@@ -0,0 +1,49 @@
|
||||
import cv2
|
||||
import numpy as np
|
||||
import torch
|
||||
import os
|
||||
|
||||
from einops import rearrange
|
||||
from .models.mbv2_mlsd_tiny import MobileV2_MLSD_Tiny
|
||||
from .models.mbv2_mlsd_large import MobileV2_MLSD_Large
|
||||
from .utils import pred_lines
|
||||
from modules import devices
|
||||
from annotator.annotator_path import models_path
|
||||
|
||||
mlsdmodel = None
|
||||
remote_model_path = "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/mlsd_large_512_fp32.pth"
|
||||
old_modeldir = os.path.dirname(os.path.realpath(__file__))
|
||||
modeldir = os.path.join(models_path, "mlsd")
|
||||
|
||||
def unload_mlsd_model():
|
||||
global mlsdmodel
|
||||
if mlsdmodel is not None:
|
||||
mlsdmodel = mlsdmodel.cpu()
|
||||
|
||||
def apply_mlsd(input_image, thr_v, thr_d):
|
||||
global modelpath, mlsdmodel
|
||||
if mlsdmodel is None:
|
||||
modelpath = os.path.join(modeldir, "mlsd_large_512_fp32.pth")
|
||||
old_modelpath = os.path.join(old_modeldir, "mlsd_large_512_fp32.pth")
|
||||
if os.path.exists(old_modelpath):
|
||||
modelpath = old_modelpath
|
||||
elif not os.path.exists(modelpath):
|
||||
from modules.modelloader import load_file_from_url
|
||||
load_file_from_url(remote_model_path, model_dir=modeldir)
|
||||
mlsdmodel = MobileV2_MLSD_Large()
|
||||
mlsdmodel.load_state_dict(torch.load(modelpath), strict=True)
|
||||
mlsdmodel = mlsdmodel.to(devices.get_device_for("controlnet")).eval()
|
||||
|
||||
model = mlsdmodel
|
||||
assert input_image.ndim == 3
|
||||
img = input_image
|
||||
img_output = np.zeros_like(img)
|
||||
try:
|
||||
with torch.no_grad():
|
||||
lines = pred_lines(img, model, [img.shape[0], img.shape[1]], thr_v, thr_d)
|
||||
for line in lines:
|
||||
x_start, y_start, x_end, y_end = [int(val) for val in line]
|
||||
cv2.line(img_output, (x_start, y_start), (x_end, y_end), [255, 255, 255], 1)
|
||||
except Exception:
pass  # line prediction can fail on degenerate inputs; fall back to the blank output map
|
||||
return img_output[:, :, 0]
|
||||
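A hedged usage sketch for apply_mlsd above, assuming this annotator package is importable from the extension root; the image path is illustrative and the thresholds are the values commonly used for M-LSD in ControlNet-style preprocessing.

import cv2
from annotator.mlsd import apply_mlsd  # assumed import path for this extension

image = cv2.imread("room.jpg")                       # any HxWx3 uint8 image
line_map = apply_mlsd(image, thr_v=0.1, thr_d=0.1)   # downloads mlsd_large_512_fp32.pth on first use
cv2.imwrite("room_lines.png", line_map)              # single-channel map, white segments on black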
@@ -0,0 +1,292 @@
|
||||
import os
|
||||
import sys
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.utils.model_zoo as model_zoo
|
||||
from torch.nn import functional as F
|
||||
|
||||
|
||||
class BlockTypeA(nn.Module):
|
||||
def __init__(self, in_c1, in_c2, out_c1, out_c2, upscale = True):
|
||||
super(BlockTypeA, self).__init__()
|
||||
self.conv1 = nn.Sequential(
|
||||
nn.Conv2d(in_c2, out_c2, kernel_size=1),
|
||||
nn.BatchNorm2d(out_c2),
|
||||
nn.ReLU(inplace=True)
|
||||
)
|
||||
self.conv2 = nn.Sequential(
|
||||
nn.Conv2d(in_c1, out_c1, kernel_size=1),
|
||||
nn.BatchNorm2d(out_c1),
|
||||
nn.ReLU(inplace=True)
|
||||
)
|
||||
self.upscale = upscale
|
||||
|
||||
def forward(self, a, b):
|
||||
b = self.conv1(b)
|
||||
a = self.conv2(a)
|
||||
if self.upscale:
|
||||
b = F.interpolate(b, scale_factor=2.0, mode='bilinear', align_corners=True)
|
||||
return torch.cat((a, b), dim=1)
|
||||
|
||||
|
||||
class BlockTypeB(nn.Module):
|
||||
def __init__(self, in_c, out_c):
|
||||
super(BlockTypeB, self).__init__()
|
||||
self.conv1 = nn.Sequential(
|
||||
nn.Conv2d(in_c, in_c, kernel_size=3, padding=1),
|
||||
nn.BatchNorm2d(in_c),
|
||||
nn.ReLU()
|
||||
)
|
||||
self.conv2 = nn.Sequential(
|
||||
nn.Conv2d(in_c, out_c, kernel_size=3, padding=1),
|
||||
nn.BatchNorm2d(out_c),
|
||||
nn.ReLU()
|
||||
)
|
||||
|
||||
def forward(self, x):
|
||||
x = self.conv1(x) + x
|
||||
x = self.conv2(x)
|
||||
return x
|
||||
|
||||
class BlockTypeC(nn.Module):
|
||||
def __init__(self, in_c, out_c):
|
||||
super(BlockTypeC, self).__init__()
|
||||
self.conv1 = nn.Sequential(
|
||||
nn.Conv2d(in_c, in_c, kernel_size=3, padding=5, dilation=5),
|
||||
nn.BatchNorm2d(in_c),
|
||||
nn.ReLU()
|
||||
)
|
||||
self.conv2 = nn.Sequential(
|
||||
nn.Conv2d(in_c, in_c, kernel_size=3, padding=1),
|
||||
nn.BatchNorm2d(in_c),
|
||||
nn.ReLU()
|
||||
)
|
||||
self.conv3 = nn.Conv2d(in_c, out_c, kernel_size=1)
|
||||
|
||||
def forward(self, x):
|
||||
x = self.conv1(x)
|
||||
x = self.conv2(x)
|
||||
x = self.conv3(x)
|
||||
return x
|
||||
|
||||
def _make_divisible(v, divisor, min_value=None):
|
||||
"""
|
||||
This function is taken from the original tf repo.
|
||||
It ensures that all layers have a channel number that is divisible by 8
|
||||
It can be seen here:
|
||||
https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
|
||||
:param v:
|
||||
:param divisor:
|
||||
:param min_value:
|
||||
:return:
|
||||
"""
|
||||
if min_value is None:
|
||||
min_value = divisor
|
||||
new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
|
||||
# Make sure that round down does not go down by more than 10%.
|
||||
if new_v < 0.9 * v:
|
||||
new_v += divisor
|
||||
return new_v
|
||||
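Concretely, with divisor 8 the helper snaps a channel count to the nearest multiple of 8 and bumps it up one step whenever rounding down would lose more than 10% (a small worked check, not part of the original file):

assert _make_divisible(32, 8) == 32  # already a multiple of 8
assert _make_divisible(18, 8) == 24  # nearest multiple is 16, but 16 < 0.9 * 18, so round up
assert _make_divisible(91, 8) == 88  # 88 is within 10% of 91, so no bump is needed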
|
||||
|
||||
class ConvBNReLU(nn.Sequential):
|
||||
def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1):
|
||||
self.channel_pad = out_planes - in_planes
|
||||
self.stride = stride
|
||||
#padding = (kernel_size - 1) // 2
|
||||
|
||||
# TFLite uses slightly different padding than PyTorch
|
||||
if stride == 2:
|
||||
padding = 0
|
||||
else:
|
||||
padding = (kernel_size - 1) // 2
|
||||
|
||||
super(ConvBNReLU, self).__init__(
|
||||
nn.Conv2d(in_planes, out_planes, kernel_size, stride, padding, groups=groups, bias=False),
|
||||
nn.BatchNorm2d(out_planes),
|
||||
nn.ReLU6(inplace=True)
|
||||
)
|
||||
self.max_pool = nn.MaxPool2d(kernel_size=stride, stride=stride)
|
||||
|
||||
|
||||
def forward(self, x):
|
||||
# TFLite uses different padding
|
||||
if self.stride == 2:
|
||||
x = F.pad(x, (0, 1, 0, 1), "constant", 0)
|
||||
#print(x.shape)
|
||||
|
||||
for module in self:
|
||||
if not isinstance(module, nn.MaxPool2d):
|
||||
x = module(x)
|
||||
return x
|
||||
|
||||
|
||||
class InvertedResidual(nn.Module):
|
||||
def __init__(self, inp, oup, stride, expand_ratio):
|
||||
super(InvertedResidual, self).__init__()
|
||||
self.stride = stride
|
||||
assert stride in [1, 2]
|
||||
|
||||
hidden_dim = int(round(inp * expand_ratio))
|
||||
self.use_res_connect = self.stride == 1 and inp == oup
|
||||
|
||||
layers = []
|
||||
if expand_ratio != 1:
|
||||
# pw
|
||||
layers.append(ConvBNReLU(inp, hidden_dim, kernel_size=1))
|
||||
layers.extend([
|
||||
# dw
|
||||
ConvBNReLU(hidden_dim, hidden_dim, stride=stride, groups=hidden_dim),
|
||||
# pw-linear
|
||||
nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
|
||||
nn.BatchNorm2d(oup),
|
||||
])
|
||||
self.conv = nn.Sequential(*layers)
|
||||
|
||||
def forward(self, x):
|
||||
if self.use_res_connect:
|
||||
return x + self.conv(x)
|
||||
else:
|
||||
return self.conv(x)
|
||||
|
||||
|
||||
class MobileNetV2(nn.Module):
|
||||
def __init__(self, pretrained=True):
|
||||
"""
|
||||
MobileNet V2 main class
|
||||
Args:
|
||||
num_classes (int): Number of classes
|
||||
width_mult (float): Width multiplier - adjusts number of channels in each layer by this amount
|
||||
inverted_residual_setting: Network structure
|
||||
round_nearest (int): Round the number of channels in each layer to be a multiple of this number
|
||||
Set to 1 to turn off rounding
|
||||
block: Module specifying inverted residual building block for mobilenet
|
||||
"""
|
||||
super(MobileNetV2, self).__init__()
|
||||
|
||||
block = InvertedResidual
|
||||
input_channel = 32
|
||||
last_channel = 1280
|
||||
width_mult = 1.0
|
||||
round_nearest = 8
|
||||
|
||||
inverted_residual_setting = [
|
||||
# t, c, n, s
|
||||
[1, 16, 1, 1],
|
||||
[6, 24, 2, 2],
|
||||
[6, 32, 3, 2],
|
||||
[6, 64, 4, 2],
|
||||
[6, 96, 3, 1],
|
||||
#[6, 160, 3, 2],
|
||||
#[6, 320, 1, 1],
|
||||
]
|
||||
|
||||
# only check the first element, assuming user knows t,c,n,s are required
|
||||
if len(inverted_residual_setting) == 0 or len(inverted_residual_setting[0]) != 4:
|
||||
raise ValueError("inverted_residual_setting should be non-empty "
|
||||
"or a 4-element list, got {}".format(inverted_residual_setting))
|
||||
|
||||
# building first layer
|
||||
input_channel = _make_divisible(input_channel * width_mult, round_nearest)
|
||||
self.last_channel = _make_divisible(last_channel * max(1.0, width_mult), round_nearest)
|
||||
features = [ConvBNReLU(4, input_channel, stride=2)]
|
||||
# building inverted residual blocks
|
||||
for t, c, n, s in inverted_residual_setting:
|
||||
output_channel = _make_divisible(c * width_mult, round_nearest)
|
||||
for i in range(n):
|
||||
stride = s if i == 0 else 1
|
||||
features.append(block(input_channel, output_channel, stride, expand_ratio=t))
|
||||
input_channel = output_channel
|
||||
|
||||
self.features = nn.Sequential(*features)
|
||||
self.fpn_selected = [1, 3, 6, 10, 13]
|
||||
# weight initialization
|
||||
for m in self.modules():
|
||||
if isinstance(m, nn.Conv2d):
|
||||
nn.init.kaiming_normal_(m.weight, mode='fan_out')
|
||||
if m.bias is not None:
|
||||
nn.init.zeros_(m.bias)
|
||||
elif isinstance(m, nn.BatchNorm2d):
|
||||
nn.init.ones_(m.weight)
|
||||
nn.init.zeros_(m.bias)
|
||||
elif isinstance(m, nn.Linear):
|
||||
nn.init.normal_(m.weight, 0, 0.01)
|
||||
nn.init.zeros_(m.bias)
|
||||
if pretrained:
|
||||
self._load_pretrained_model()
|
||||
|
||||
def _forward_impl(self, x):
|
||||
# This exists since TorchScript doesn't support inheritance, so the superclass method
|
||||
# (this one) needs to have a name other than `forward` that can be accessed in a subclass
|
||||
fpn_features = []
|
||||
for i, f in enumerate(self.features):
|
||||
if i > self.fpn_selected[-1]:
|
||||
break
|
||||
x = f(x)
|
||||
if i in self.fpn_selected:
|
||||
fpn_features.append(x)
|
||||
|
||||
c1, c2, c3, c4, c5 = fpn_features
|
||||
return c1, c2, c3, c4, c5
|
||||
|
||||
|
||||
def forward(self, x):
|
||||
return self._forward_impl(x)
|
||||
|
||||
def _load_pretrained_model(self):
|
||||
pretrain_dict = model_zoo.load_url('https://download.pytorch.org/models/mobilenet_v2-b0353104.pth')
|
||||
model_dict = {}
|
||||
state_dict = self.state_dict()
|
||||
for k, v in pretrain_dict.items():
|
||||
if k in state_dict:
|
||||
model_dict[k] = v
|
||||
state_dict.update(model_dict)
|
||||
self.load_state_dict(state_dict)
|
||||
|
||||
|
||||
class MobileV2_MLSD_Large(nn.Module):
|
||||
def __init__(self):
|
||||
super(MobileV2_MLSD_Large, self).__init__()
|
||||
|
||||
self.backbone = MobileNetV2(pretrained=False)
|
||||
## A, B
|
||||
self.block15 = BlockTypeA(in_c1= 64, in_c2= 96,
|
||||
out_c1= 64, out_c2=64,
|
||||
upscale=False)
|
||||
self.block16 = BlockTypeB(128, 64)
|
||||
|
||||
## A, B
|
||||
self.block17 = BlockTypeA(in_c1 = 32, in_c2 = 64,
|
||||
out_c1= 64, out_c2= 64)
|
||||
self.block18 = BlockTypeB(128, 64)
|
||||
|
||||
## A, B
|
||||
self.block19 = BlockTypeA(in_c1=24, in_c2=64,
|
||||
out_c1=64, out_c2=64)
|
||||
self.block20 = BlockTypeB(128, 64)
|
||||
|
||||
## A, B, C
|
||||
self.block21 = BlockTypeA(in_c1=16, in_c2=64,
|
||||
out_c1=64, out_c2=64)
|
||||
self.block22 = BlockTypeB(128, 64)
|
||||
|
||||
self.block23 = BlockTypeC(64, 16)
|
||||
|
||||
def forward(self, x):
|
||||
c1, c2, c3, c4, c5 = self.backbone(x)
|
||||
|
||||
x = self.block15(c4, c5)
|
||||
x = self.block16(x)
|
||||
|
||||
x = self.block17(c3, x)
|
||||
x = self.block18(x)
|
||||
|
||||
x = self.block19(c2, x)
|
||||
x = self.block20(x)
|
||||
|
||||
x = self.block21(c1, x)
|
||||
x = self.block22(x)
|
||||
x = self.block23(x)
|
||||
x = x[:, 7:, :, :]
|
||||
|
||||
return x
|
||||
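A quick shape check for the large model above, offered as a sketch: the backbone is built with pretrained=False, so nothing is downloaded, and the 4-channel input mirrors the constant plane that pred_lines appends to the RGB image before inference.

import torch

net = MobileV2_MLSD_Large().eval()
with torch.no_grad():
    out = net(torch.zeros(1, 4, 512, 512))
print(out.shape)  # expected torch.Size([1, 9, 256, 256]): center, displacement and auxiliary maps at half resolution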
@@ -0,0 +1,275 @@
|
||||
import os
|
||||
import sys
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.utils.model_zoo as model_zoo
|
||||
from torch.nn import functional as F
|
||||
|
||||
|
||||
class BlockTypeA(nn.Module):
|
||||
def __init__(self, in_c1, in_c2, out_c1, out_c2, upscale = True):
|
||||
super(BlockTypeA, self).__init__()
|
||||
self.conv1 = nn.Sequential(
|
||||
nn.Conv2d(in_c2, out_c2, kernel_size=1),
|
||||
nn.BatchNorm2d(out_c2),
|
||||
nn.ReLU(inplace=True)
|
||||
)
|
||||
self.conv2 = nn.Sequential(
|
||||
nn.Conv2d(in_c1, out_c1, kernel_size=1),
|
||||
nn.BatchNorm2d(out_c1),
|
||||
nn.ReLU(inplace=True)
|
||||
)
|
||||
self.upscale = upscale
|
||||
|
||||
def forward(self, a, b):
|
||||
b = self.conv1(b)
|
||||
a = self.conv2(a)
|
||||
b = F.interpolate(b, scale_factor=2.0, mode='bilinear', align_corners=True)
|
||||
return torch.cat((a, b), dim=1)
|
||||
|
||||
|
||||
class BlockTypeB(nn.Module):
|
||||
def __init__(self, in_c, out_c):
|
||||
super(BlockTypeB, self).__init__()
|
||||
self.conv1 = nn.Sequential(
|
||||
nn.Conv2d(in_c, in_c, kernel_size=3, padding=1),
|
||||
nn.BatchNorm2d(in_c),
|
||||
nn.ReLU()
|
||||
)
|
||||
self.conv2 = nn.Sequential(
|
||||
nn.Conv2d(in_c, out_c, kernel_size=3, padding=1),
|
||||
nn.BatchNorm2d(out_c),
|
||||
nn.ReLU()
|
||||
)
|
||||
|
||||
def forward(self, x):
|
||||
x = self.conv1(x) + x
|
||||
x = self.conv2(x)
|
||||
return x
|
||||
|
||||
class BlockTypeC(nn.Module):
|
||||
def __init__(self, in_c, out_c):
|
||||
super(BlockTypeC, self).__init__()
|
||||
self.conv1 = nn.Sequential(
|
||||
nn.Conv2d(in_c, in_c, kernel_size=3, padding=5, dilation=5),
|
||||
nn.BatchNorm2d(in_c),
|
||||
nn.ReLU()
|
||||
)
|
||||
self.conv2 = nn.Sequential(
|
||||
nn.Conv2d(in_c, in_c, kernel_size=3, padding=1),
|
||||
nn.BatchNorm2d(in_c),
|
||||
nn.ReLU()
|
||||
)
|
||||
self.conv3 = nn.Conv2d(in_c, out_c, kernel_size=1)
|
||||
|
||||
def forward(self, x):
|
||||
x = self.conv1(x)
|
||||
x = self.conv2(x)
|
||||
x = self.conv3(x)
|
||||
return x
|
||||
|
||||
def _make_divisible(v, divisor, min_value=None):
|
||||
"""
|
||||
This function is taken from the original tf repo.
|
||||
It ensures that all layers have a channel number that is divisible by 8
|
||||
It can be seen here:
|
||||
https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
|
||||
:param v:
|
||||
:param divisor:
|
||||
:param min_value:
|
||||
:return:
|
||||
"""
|
||||
if min_value is None:
|
||||
min_value = divisor
|
||||
new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
|
||||
# Make sure that round down does not go down by more than 10%.
|
||||
if new_v < 0.9 * v:
|
||||
new_v += divisor
|
||||
return new_v
|
||||
|
||||
|
||||
class ConvBNReLU(nn.Sequential):
|
||||
def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1):
|
||||
self.channel_pad = out_planes - in_planes
|
||||
self.stride = stride
|
||||
#padding = (kernel_size - 1) // 2
|
||||
|
||||
# TFLite uses slightly different padding than PyTorch
|
||||
if stride == 2:
|
||||
padding = 0
|
||||
else:
|
||||
padding = (kernel_size - 1) // 2
|
||||
|
||||
super(ConvBNReLU, self).__init__(
|
||||
nn.Conv2d(in_planes, out_planes, kernel_size, stride, padding, groups=groups, bias=False),
|
||||
nn.BatchNorm2d(out_planes),
|
||||
nn.ReLU6(inplace=True)
|
||||
)
|
||||
self.max_pool = nn.MaxPool2d(kernel_size=stride, stride=stride)
|
||||
|
||||
|
||||
def forward(self, x):
|
||||
# TFLite uses different padding
|
||||
if self.stride == 2:
|
||||
x = F.pad(x, (0, 1, 0, 1), "constant", 0)
|
||||
#print(x.shape)
|
||||
|
||||
for module in self:
|
||||
if not isinstance(module, nn.MaxPool2d):
|
||||
x = module(x)
|
||||
return x
|
||||
|
||||
|
||||
class InvertedResidual(nn.Module):
|
||||
def __init__(self, inp, oup, stride, expand_ratio):
|
||||
super(InvertedResidual, self).__init__()
|
||||
self.stride = stride
|
||||
assert stride in [1, 2]
|
||||
|
||||
hidden_dim = int(round(inp * expand_ratio))
|
||||
self.use_res_connect = self.stride == 1 and inp == oup
|
||||
|
||||
layers = []
|
||||
if expand_ratio != 1:
|
||||
# pw
|
||||
layers.append(ConvBNReLU(inp, hidden_dim, kernel_size=1))
|
||||
layers.extend([
|
||||
# dw
|
||||
ConvBNReLU(hidden_dim, hidden_dim, stride=stride, groups=hidden_dim),
|
||||
# pw-linear
|
||||
nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
|
||||
nn.BatchNorm2d(oup),
|
||||
])
|
||||
self.conv = nn.Sequential(*layers)
|
||||
|
||||
def forward(self, x):
|
||||
if self.use_res_connect:
|
||||
return x + self.conv(x)
|
||||
else:
|
||||
return self.conv(x)
|
||||
|
||||
|
||||
class MobileNetV2(nn.Module):
|
||||
def __init__(self, pretrained=True):
|
||||
"""
|
||||
MobileNet V2 main class
|
||||
Args:
|
||||
num_classes (int): Number of classes
|
||||
width_mult (float): Width multiplier - adjusts number of channels in each layer by this amount
|
||||
inverted_residual_setting: Network structure
|
||||
round_nearest (int): Round the number of channels in each layer to be a multiple of this number
|
||||
Set to 1 to turn off rounding
|
||||
block: Module specifying inverted residual building block for mobilenet
|
||||
"""
|
||||
super(MobileNetV2, self).__init__()
|
||||
|
||||
block = InvertedResidual
|
||||
input_channel = 32
|
||||
last_channel = 1280
|
||||
width_mult = 1.0
|
||||
round_nearest = 8
|
||||
|
||||
inverted_residual_setting = [
|
||||
# t, c, n, s
|
||||
[1, 16, 1, 1],
|
||||
[6, 24, 2, 2],
|
||||
[6, 32, 3, 2],
|
||||
[6, 64, 4, 2],
|
||||
#[6, 96, 3, 1],
|
||||
#[6, 160, 3, 2],
|
||||
#[6, 320, 1, 1],
|
||||
]
|
||||
|
||||
# only check the first element, assuming user knows t,c,n,s are required
|
||||
if len(inverted_residual_setting) == 0 or len(inverted_residual_setting[0]) != 4:
|
||||
raise ValueError("inverted_residual_setting should be non-empty "
|
||||
"or a 4-element list, got {}".format(inverted_residual_setting))
|
||||
|
||||
# building first layer
|
||||
input_channel = _make_divisible(input_channel * width_mult, round_nearest)
|
||||
self.last_channel = _make_divisible(last_channel * max(1.0, width_mult), round_nearest)
|
||||
features = [ConvBNReLU(4, input_channel, stride=2)]
|
||||
# building inverted residual blocks
|
||||
for t, c, n, s in inverted_residual_setting:
|
||||
output_channel = _make_divisible(c * width_mult, round_nearest)
|
||||
for i in range(n):
|
||||
stride = s if i == 0 else 1
|
||||
features.append(block(input_channel, output_channel, stride, expand_ratio=t))
|
||||
input_channel = output_channel
|
||||
self.features = nn.Sequential(*features)
|
||||
|
||||
self.fpn_selected = [3, 6, 10]
|
||||
# weight initialization
|
||||
for m in self.modules():
|
||||
if isinstance(m, nn.Conv2d):
|
||||
nn.init.kaiming_normal_(m.weight, mode='fan_out')
|
||||
if m.bias is not None:
|
||||
nn.init.zeros_(m.bias)
|
||||
elif isinstance(m, nn.BatchNorm2d):
|
||||
nn.init.ones_(m.weight)
|
||||
nn.init.zeros_(m.bias)
|
||||
elif isinstance(m, nn.Linear):
|
||||
nn.init.normal_(m.weight, 0, 0.01)
|
||||
nn.init.zeros_(m.bias)
|
||||
|
||||
#if pretrained:
|
||||
# self._load_pretrained_model()
|
||||
|
||||
def _forward_impl(self, x):
|
||||
# This exists since TorchScript doesn't support inheritance, so the superclass method
|
||||
# (this one) needs to have a name other than `forward` that can be accessed in a subclass
|
||||
fpn_features = []
|
||||
for i, f in enumerate(self.features):
|
||||
if i > self.fpn_selected[-1]:
|
||||
break
|
||||
x = f(x)
|
||||
if i in self.fpn_selected:
|
||||
fpn_features.append(x)
|
||||
|
||||
c2, c3, c4 = fpn_features
|
||||
return c2, c3, c4
|
||||
|
||||
|
||||
def forward(self, x):
|
||||
return self._forward_impl(x)
|
||||
|
||||
def _load_pretrained_model(self):
|
||||
pretrain_dict = model_zoo.load_url('https://download.pytorch.org/models/mobilenet_v2-b0353104.pth')
|
||||
model_dict = {}
|
||||
state_dict = self.state_dict()
|
||||
for k, v in pretrain_dict.items():
|
||||
if k in state_dict:
|
||||
model_dict[k] = v
|
||||
state_dict.update(model_dict)
|
||||
self.load_state_dict(state_dict)
|
||||
|
||||
|
||||
class MobileV2_MLSD_Tiny(nn.Module):
|
||||
def __init__(self):
|
||||
super(MobileV2_MLSD_Tiny, self).__init__()
|
||||
|
||||
self.backbone = MobileNetV2(pretrained=True)
|
||||
|
||||
self.block12 = BlockTypeA(in_c1= 32, in_c2= 64,
|
||||
out_c1= 64, out_c2=64)
|
||||
self.block13 = BlockTypeB(128, 64)
|
||||
|
||||
self.block14 = BlockTypeA(in_c1 = 24, in_c2 = 64,
|
||||
out_c1= 32, out_c2= 32)
|
||||
self.block15 = BlockTypeB(64, 64)
|
||||
|
||||
self.block16 = BlockTypeC(64, 16)
|
||||
|
||||
def forward(self, x):
|
||||
c2, c3, c4 = self.backbone(x)
|
||||
|
||||
x = self.block12(c3, c4)
|
||||
x = self.block13(x)
|
||||
x = self.block14(c2, x)
|
||||
x = self.block15(x)
|
||||
x = self.block16(x)
|
||||
x = x[:, 7:, :, :]
|
||||
#print(x.shape)
|
||||
x = F.interpolate(x, scale_factor=2.0, mode='bilinear', align_corners=True)
|
||||
|
||||
return x
|
||||
581
extensions-builtin/forge_legacy_preprocessors/annotator/mlsd/utils.py
Executable file
@@ -0,0 +1,581 @@
|
||||
'''
|
||||
modified by lihaoweicv
|
||||
pytorch version
|
||||
'''
|
||||
|
||||
'''
|
||||
M-LSD
|
||||
Copyright 2021-present NAVER Corp.
|
||||
Apache License v2.0
|
||||
'''
|
||||
|
||||
import os
|
||||
import numpy as np
|
||||
import cv2
|
||||
import torch
|
||||
from torch.nn import functional as F
|
||||
from modules import devices
|
||||
|
||||
|
||||
def deccode_output_score_and_ptss(tpMap, topk_n = 200, ksize = 5):
|
||||
'''
|
||||
tpMap:
|
||||
center: tpMap[1, 0, :, :]
|
||||
displacement: tpMap[1, 1:5, :, :]
|
||||
'''
|
||||
b, c, h, w = tpMap.shape
|
||||
assert b==1, 'only support bsize==1'
|
||||
displacement = tpMap[:, 1:5, :, :][0]
|
||||
center = tpMap[:, 0, :, :]
|
||||
heat = torch.sigmoid(center)
|
||||
hmax = F.max_pool2d( heat, (ksize, ksize), stride=1, padding=(ksize-1)//2)
|
||||
keep = (hmax == heat).float()
|
||||
heat = heat * keep
|
||||
heat = heat.reshape(-1, )
|
||||
|
||||
scores, indices = torch.topk(heat, topk_n, dim=-1, largest=True)
|
||||
yy = torch.floor_divide(indices, w).unsqueeze(-1)
|
||||
xx = torch.fmod(indices, w).unsqueeze(-1)
|
||||
ptss = torch.cat((yy, xx),dim=-1)
|
||||
|
||||
ptss = ptss.detach().cpu().numpy()
|
||||
scores = scores.detach().cpu().numpy()
|
||||
displacement = displacement.detach().cpu().numpy()
|
||||
displacement = displacement.transpose((1,2,0))
|
||||
return ptss, scores, displacement
|
||||
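The decoding above is a standard heatmap non-maximum suppression: the sigmoid'd center map is max-pooled, only positions that equal their local maximum survive, and top-k picks the strongest junctions. A tiny standalone sketch of the same trick on a toy heatmap (illustrative only):

import torch
import torch.nn.functional as F

heat = torch.sigmoid(torch.randn(1, 1, 8, 8))                 # toy center heatmap
hmax = F.max_pool2d(heat, kernel_size=5, stride=1, padding=2) # local maxima via max-pool
nms = heat * (hmax == heat).float()                           # suppress everything else
scores, idx = torch.topk(nms.reshape(-1), k=5)
yy = torch.div(idx, 8, rounding_mode='floor')                 # recover row indices
xx = torch.fmod(idx, 8)                                       # recover column indices
print(list(zip(yy.tolist(), xx.tolist(), scores.tolist())))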
|
||||
|
||||
def pred_lines(image, model,
|
||||
input_shape=[512, 512],
|
||||
score_thr=0.10,
|
||||
dist_thr=20.0):
|
||||
h, w, _ = image.shape
|
||||
h_ratio, w_ratio = [h / input_shape[0], w / input_shape[1]]
|
||||
|
||||
resized_image = np.concatenate([cv2.resize(image, (input_shape[1], input_shape[0]), interpolation=cv2.INTER_AREA),
|
||||
np.ones([input_shape[0], input_shape[1], 1])], axis=-1)
|
||||
|
||||
resized_image = resized_image.transpose((2,0,1))
|
||||
batch_image = np.expand_dims(resized_image, axis=0).astype('float32')
|
||||
batch_image = (batch_image / 127.5) - 1.0
|
||||
|
||||
batch_image = torch.from_numpy(batch_image).float().to(devices.get_device_for("controlnet"))
|
||||
outputs = model(batch_image)
|
||||
pts, pts_score, vmap = deccode_output_score_and_ptss(outputs, 200, 3)
|
||||
start = vmap[:, :, :2]
|
||||
end = vmap[:, :, 2:]
|
||||
dist_map = np.sqrt(np.sum((start - end) ** 2, axis=-1))
|
||||
|
||||
segments_list = []
|
||||
for center, score in zip(pts, pts_score):
|
||||
y, x = center
|
||||
distance = dist_map[y, x]
|
||||
if score > score_thr and distance > dist_thr:
|
||||
disp_x_start, disp_y_start, disp_x_end, disp_y_end = vmap[y, x, :]
|
||||
x_start = x + disp_x_start
|
||||
y_start = y + disp_y_start
|
||||
x_end = x + disp_x_end
|
||||
y_end = y + disp_y_end
|
||||
segments_list.append([x_start, y_start, x_end, y_end])
|
||||
|
||||
lines = 2 * np.array(segments_list) # 256 > 512
|
||||
lines[:, 0] = lines[:, 0] * w_ratio
|
||||
lines[:, 1] = lines[:, 1] * h_ratio
|
||||
lines[:, 2] = lines[:, 2] * w_ratio
|
||||
lines[:, 3] = lines[:, 3] * h_ratio
|
||||
|
||||
return lines
|
||||
|
||||
|
||||
def pred_squares(image,
|
||||
model,
|
||||
input_shape=[512, 512],
|
||||
params={'score': 0.06,
|
||||
'outside_ratio': 0.28,
|
||||
'inside_ratio': 0.45,
|
||||
'w_overlap': 0.0,
|
||||
'w_degree': 1.95,
|
||||
'w_length': 0.0,
|
||||
'w_area': 1.86,
|
||||
'w_center': 0.14}):
|
||||
'''
|
||||
shape = [height, width]
|
||||
'''
|
||||
h, w, _ = image.shape
|
||||
original_shape = [h, w]
|
||||
|
||||
resized_image = np.concatenate([cv2.resize(image, (input_shape[0], input_shape[1]), interpolation=cv2.INTER_AREA),
|
||||
np.ones([input_shape[0], input_shape[1], 1])], axis=-1)
|
||||
resized_image = resized_image.transpose((2, 0, 1))
|
||||
batch_image = np.expand_dims(resized_image, axis=0).astype('float32')
|
||||
batch_image = (batch_image / 127.5) - 1.0
|
||||
|
||||
batch_image = torch.from_numpy(batch_image).float().to(devices.get_device_for("controlnet"))
|
||||
outputs = model(batch_image)
|
||||
|
||||
pts, pts_score, vmap = deccode_output_score_and_ptss(outputs, 200, 3)
|
||||
start = vmap[:, :, :2] # (x, y)
|
||||
end = vmap[:, :, 2:] # (x, y)
|
||||
dist_map = np.sqrt(np.sum((start - end) ** 2, axis=-1))
|
||||
|
||||
junc_list = []
|
||||
segments_list = []
|
||||
for junc, score in zip(pts, pts_score):
|
||||
y, x = junc
|
||||
distance = dist_map[y, x]
|
||||
if score > params['score'] and distance > 20.0:
|
||||
junc_list.append([x, y])
|
||||
disp_x_start, disp_y_start, disp_x_end, disp_y_end = vmap[y, x, :]
|
||||
d_arrow = 1.0
|
||||
x_start = x + d_arrow * disp_x_start
|
||||
y_start = y + d_arrow * disp_y_start
|
||||
x_end = x + d_arrow * disp_x_end
|
||||
y_end = y + d_arrow * disp_y_end
|
||||
segments_list.append([x_start, y_start, x_end, y_end])
|
||||
|
||||
segments = np.array(segments_list)
|
||||
|
||||
####### post processing for squares
|
||||
# 1. get unique lines
|
||||
point = np.array([[0, 0]])
|
||||
point = point[0]
|
||||
start = segments[:, :2]
|
||||
end = segments[:, 2:]
|
||||
diff = start - end
|
||||
a = diff[:, 1]
|
||||
b = -diff[:, 0]
|
||||
c = a * start[:, 0] + b * start[:, 1]
|
||||
|
||||
d = np.abs(a * point[0] + b * point[1] - c) / np.sqrt(a ** 2 + b ** 2 + 1e-10)
|
||||
theta = np.arctan2(diff[:, 0], diff[:, 1]) * 180 / np.pi
|
||||
theta[theta < 0.0] += 180
|
||||
hough = np.concatenate([d[:, None], theta[:, None]], axis=-1)
|
||||
|
||||
d_quant = 1
|
||||
theta_quant = 2
|
||||
hough[:, 0] //= d_quant
|
||||
hough[:, 1] //= theta_quant
|
||||
_, indices, counts = np.unique(hough, axis=0, return_index=True, return_counts=True)
|
||||
|
||||
acc_map = np.zeros([512 // d_quant + 1, 360 // theta_quant + 1], dtype='float32')
|
||||
idx_map = np.zeros([512 // d_quant + 1, 360 // theta_quant + 1], dtype='int32') - 1
|
||||
yx_indices = hough[indices, :].astype('int32')
|
||||
acc_map[yx_indices[:, 0], yx_indices[:, 1]] = counts
|
||||
idx_map[yx_indices[:, 0], yx_indices[:, 1]] = indices
|
||||
|
||||
acc_map_np = acc_map
|
||||
# acc_map = acc_map[None, :, :, None]
|
||||
#
|
||||
# ### fast suppression using tensorflow op
|
||||
# acc_map = tf.constant(acc_map, dtype=tf.float32)
|
||||
# max_acc_map = tf.keras.layers.MaxPool2D(pool_size=(5, 5), strides=1, padding='same')(acc_map)
|
||||
# acc_map = acc_map * tf.cast(tf.math.equal(acc_map, max_acc_map), tf.float32)
|
||||
# flatten_acc_map = tf.reshape(acc_map, [1, -1])
|
||||
# topk_values, topk_indices = tf.math.top_k(flatten_acc_map, k=len(pts))
|
||||
# _, h, w, _ = acc_map.shape
|
||||
# y = tf.expand_dims(topk_indices // w, axis=-1)
|
||||
# x = tf.expand_dims(topk_indices % w, axis=-1)
|
||||
# yx = tf.concat([y, x], axis=-1)
|
||||
|
||||
### fast suppression using pytorch op
|
||||
acc_map = torch.from_numpy(acc_map_np).unsqueeze(0).unsqueeze(0)
|
||||
_,_, h, w = acc_map.shape
|
||||
max_acc_map = F.max_pool2d(acc_map,kernel_size=5, stride=1, padding=2)
|
||||
acc_map = acc_map * ( (acc_map == max_acc_map).float() )
|
||||
flatten_acc_map = acc_map.reshape([-1, ])
|
||||
|
||||
scores, indices = torch.topk(flatten_acc_map, len(pts), dim=-1, largest=True)
|
||||
yy = torch.div(indices, w, rounding_mode='floor').unsqueeze(-1)
|
||||
xx = torch.fmod(indices, w).unsqueeze(-1)
|
||||
yx = torch.cat((yy, xx), dim=-1)
|
||||
|
||||
yx = yx.detach().cpu().numpy()
|
||||
|
||||
topk_values = scores.detach().cpu().numpy()
|
||||
indices = idx_map[yx[:, 0], yx[:, 1]]
|
||||
basis = 5 // 2
|
||||
|
||||
merged_segments = []
|
||||
for yx_pt, max_indice, value in zip(yx, indices, topk_values):
|
||||
y, x = yx_pt
|
||||
if max_indice == -1 or value == 0:
|
||||
continue
|
||||
segment_list = []
|
||||
for y_offset in range(-basis, basis + 1):
|
||||
for x_offset in range(-basis, basis + 1):
|
||||
indice = idx_map[y + y_offset, x + x_offset]
|
||||
cnt = int(acc_map_np[y + y_offset, x + x_offset])
|
||||
if indice != -1:
|
||||
segment_list.append(segments[indice])
|
||||
if cnt > 1:
|
||||
check_cnt = 1
|
||||
current_hough = hough[indice]
|
||||
for new_indice, new_hough in enumerate(hough):
|
||||
if (current_hough == new_hough).all() and indice != new_indice:
|
||||
segment_list.append(segments[new_indice])
|
||||
check_cnt += 1
|
||||
if check_cnt == cnt:
|
||||
break
|
||||
group_segments = np.array(segment_list).reshape([-1, 2])
|
||||
sorted_group_segments = np.sort(group_segments, axis=0)
|
||||
x_min, y_min = sorted_group_segments[0, :]
|
||||
x_max, y_max = sorted_group_segments[-1, :]
|
||||
|
||||
deg = theta[max_indice]
|
||||
if deg >= 90:
|
||||
merged_segments.append([x_min, y_max, x_max, y_min])
|
||||
else:
|
||||
merged_segments.append([x_min, y_min, x_max, y_max])
|
||||
|
||||
# 2. get intersections
|
||||
new_segments = np.array(merged_segments) # (x1, y1, x2, y2)
|
||||
start = new_segments[:, :2] # (x1, y1)
|
||||
end = new_segments[:, 2:] # (x2, y2)
|
||||
new_centers = (start + end) / 2.0
|
||||
diff = start - end
|
||||
dist_segments = np.sqrt(np.sum(diff ** 2, axis=-1))
|
||||
|
||||
# ax + by = c
|
||||
a = diff[:, 1]
|
||||
b = -diff[:, 0]
|
||||
c = a * start[:, 0] + b * start[:, 1]
|
||||
pre_det = a[:, None] * b[None, :]
|
||||
det = pre_det - np.transpose(pre_det)
|
||||
|
||||
pre_inter_y = a[:, None] * c[None, :]
|
||||
inter_y = (pre_inter_y - np.transpose(pre_inter_y)) / (det + 1e-10)
|
||||
pre_inter_x = c[:, None] * b[None, :]
|
||||
inter_x = (pre_inter_x - np.transpose(pre_inter_x)) / (det + 1e-10)
|
||||
inter_pts = np.concatenate([inter_x[:, :, None], inter_y[:, :, None]], axis=-1).astype('int32')
|
||||
|
||||
# 3. get corner information
|
||||
# 3.1 get distance
|
||||
'''
|
||||
dist_segments:
|
||||
| dist(0), dist(1), dist(2), ...|
|
||||
dist_inter_to_segment1:
|
||||
| dist(inter,0), dist(inter,0), dist(inter,0), ... |
|
||||
| dist(inter,1), dist(inter,1), dist(inter,1), ... |
|
||||
...
|
||||
dist_inter_to_segment2:
|
||||
| dist(inter,0), dist(inter,1), dist(inter,2), ... |
|
||||
| dist(inter,0), dist(inter,1), dist(inter,2), ... |
|
||||
...
|
||||
'''
|
||||
|
||||
dist_inter_to_segment1_start = np.sqrt(
|
||||
np.sum(((inter_pts - start[:, None, :]) ** 2), axis=-1, keepdims=True)) # [n_batch, n_batch, 1]
|
||||
dist_inter_to_segment1_end = np.sqrt(
|
||||
np.sum(((inter_pts - end[:, None, :]) ** 2), axis=-1, keepdims=True)) # [n_batch, n_batch, 1]
|
||||
dist_inter_to_segment2_start = np.sqrt(
|
||||
np.sum(((inter_pts - start[None, :, :]) ** 2), axis=-1, keepdims=True)) # [n_batch, n_batch, 1]
|
||||
dist_inter_to_segment2_end = np.sqrt(
|
||||
np.sum(((inter_pts - end[None, :, :]) ** 2), axis=-1, keepdims=True)) # [n_batch, n_batch, 1]
|
||||
|
||||
# sort ascending
|
||||
dist_inter_to_segment1 = np.sort(
|
||||
np.concatenate([dist_inter_to_segment1_start, dist_inter_to_segment1_end], axis=-1),
|
||||
axis=-1) # [n_batch, n_batch, 2]
|
||||
dist_inter_to_segment2 = np.sort(
|
||||
np.concatenate([dist_inter_to_segment2_start, dist_inter_to_segment2_end], axis=-1),
|
||||
axis=-1) # [n_batch, n_batch, 2]
|
||||
|
||||
# 3.2 get degree
|
||||
inter_to_start = new_centers[:, None, :] - inter_pts
|
||||
deg_inter_to_start = np.arctan2(inter_to_start[:, :, 1], inter_to_start[:, :, 0]) * 180 / np.pi
|
||||
deg_inter_to_start[deg_inter_to_start < 0.0] += 360
|
||||
inter_to_end = new_centers[None, :, :] - inter_pts
|
||||
deg_inter_to_end = np.arctan2(inter_to_end[:, :, 1], inter_to_end[:, :, 0]) * 180 / np.pi
|
||||
deg_inter_to_end[deg_inter_to_end < 0.0] += 360
|
||||
|
||||
'''
|
||||
B -- G
|
||||
| |
|
||||
C -- R
|
||||
B : blue / G: green / C: cyan / R: red
|
||||
|
||||
0 -- 1
|
||||
| |
|
||||
3 -- 2
|
||||
'''
|
||||
# rename variables
|
||||
deg1_map, deg2_map = deg_inter_to_start, deg_inter_to_end
|
||||
# sort deg ascending
|
||||
deg_sort = np.sort(np.concatenate([deg1_map[:, :, None], deg2_map[:, :, None]], axis=-1), axis=-1)
|
||||
|
||||
deg_diff_map = np.abs(deg1_map - deg2_map)
|
||||
# we only consider the smallest degree of intersect
|
||||
deg_diff_map[deg_diff_map > 180] = 360 - deg_diff_map[deg_diff_map > 180]
|
||||
|
||||
# define available degree range
|
||||
deg_range = [60, 120]
|
||||
|
||||
corner_dict = {corner_info: [] for corner_info in range(4)}
|
||||
inter_points = []
|
||||
    for i in range(inter_pts.shape[0]):
        for j in range(i + 1, inter_pts.shape[1]):
            # i, j > line index, always i < j
            x, y = inter_pts[i, j, :]
            deg1, deg2 = deg_sort[i, j, :]
            deg_diff = deg_diff_map[i, j]

            check_degree = deg_diff > deg_range[0] and deg_diff < deg_range[1]

            outside_ratio = params['outside_ratio']  # over ratio >>> drop it!
            inside_ratio = params['inside_ratio']  # over ratio >>> drop it!
            check_distance = ((dist_inter_to_segment1[i, j, 1] >= dist_segments[i] and \
                               dist_inter_to_segment1[i, j, 0] <= dist_segments[i] * outside_ratio) or \
                              (dist_inter_to_segment1[i, j, 1] <= dist_segments[i] and \
                               dist_inter_to_segment1[i, j, 0] <= dist_segments[i] * inside_ratio)) and \
                             ((dist_inter_to_segment2[i, j, 1] >= dist_segments[j] and \
                               dist_inter_to_segment2[i, j, 0] <= dist_segments[j] * outside_ratio) or \
                              (dist_inter_to_segment2[i, j, 1] <= dist_segments[j] and \
                               dist_inter_to_segment2[i, j, 0] <= dist_segments[j] * inside_ratio))

            if check_degree and check_distance:
                corner_info = None

                if (deg1 >= 0 and deg1 <= 45 and deg2 >= 45 and deg2 <= 120) or \
                        (deg2 >= 315 and deg1 >= 45 and deg1 <= 120):
                    corner_info, color_info = 0, 'blue'
                elif (deg1 >= 45 and deg1 <= 125 and deg2 >= 125 and deg2 <= 225):
                    corner_info, color_info = 1, 'green'
                elif (deg1 >= 125 and deg1 <= 225 and deg2 >= 225 and deg2 <= 315):
                    corner_info, color_info = 2, 'black'
                elif (deg1 >= 0 and deg1 <= 45 and deg2 >= 225 and deg2 <= 315) or \
                        (deg2 >= 315 and deg1 >= 225 and deg1 <= 315):
                    corner_info, color_info = 3, 'cyan'
                else:
                    corner_info, color_info = 4, 'red'  # we don't use it
                    continue

                corner_dict[corner_info].append([x, y, i, j])
                inter_points.append([x, y])

    square_list = []
    connect_list = []
    segments_list = []
    for corner0 in corner_dict[0]:
        for corner1 in corner_dict[1]:
            connect01 = False
            for corner0_line in corner0[2:]:
                if corner0_line in corner1[2:]:
                    connect01 = True
                    break
            if connect01:
                for corner2 in corner_dict[2]:
                    connect12 = False
                    for corner1_line in corner1[2:]:
                        if corner1_line in corner2[2:]:
                            connect12 = True
                            break
                    if connect12:
                        for corner3 in corner_dict[3]:
                            connect23 = False
                            for corner2_line in corner2[2:]:
                                if corner2_line in corner3[2:]:
                                    connect23 = True
                                    break
                            if connect23:
                                for corner3_line in corner3[2:]:
                                    if corner3_line in corner0[2:]:
                                        # SQUARE!!!
                                        '''
                                        0 -- 1
                                        |    |
                                        3 -- 2
                                        square_list:
                                            order: 0 > 1 > 2 > 3
                                            | x0, y0, x1, y1, x2, y2, x3, y3 |
                                            | x0, y0, x1, y1, x2, y2, x3, y3 |
                                            ...
                                        connect_list:
                                            order: 01 > 12 > 23 > 30
                                            | line_idx01, line_idx12, line_idx23, line_idx30 |
                                            | line_idx01, line_idx12, line_idx23, line_idx30 |
                                            ...
                                        segments_list:
                                            order: 0 > 1 > 2 > 3
                                            | line_idx0_i, line_idx0_j, line_idx1_i, line_idx1_j, line_idx2_i, line_idx2_j, line_idx3_i, line_idx3_j |
                                            | line_idx0_i, line_idx0_j, line_idx1_i, line_idx1_j, line_idx2_i, line_idx2_j, line_idx3_i, line_idx3_j |
                                            ...
                                        '''
                                        square_list.append(corner0[:2] + corner1[:2] + corner2[:2] + corner3[:2])
                                        connect_list.append([corner0_line, corner1_line, corner2_line, corner3_line])
                                        segments_list.append(corner0[2:] + corner1[2:] + corner2[2:] + corner3[2:])

    def check_outside_inside(segments_info, connect_idx):
        # return 'outside or inside', min distance, cover_param, peri_param
        if connect_idx == segments_info[0]:
            check_dist_mat = dist_inter_to_segment1
        else:
            check_dist_mat = dist_inter_to_segment2

        i, j = segments_info
        min_dist, max_dist = check_dist_mat[i, j, :]
        connect_dist = dist_segments[connect_idx]
        if max_dist > connect_dist:
            return 'outside', min_dist, 0, 1
        else:
            return 'inside', min_dist, -1, -1

    top_square = None

    try:
        map_size = input_shape[0] / 2
        squares = np.array(square_list).reshape([-1, 4, 2])
        score_array = []
        connect_array = np.array(connect_list)
        segments_array = np.array(segments_list).reshape([-1, 4, 2])

        # get degree of corners:
        squares_rollup = np.roll(squares, 1, axis=1)
        squares_rolldown = np.roll(squares, -1, axis=1)
        vec1 = squares_rollup - squares
        normalized_vec1 = vec1 / (np.linalg.norm(vec1, axis=-1, keepdims=True) + 1e-10)
        vec2 = squares_rolldown - squares
        normalized_vec2 = vec2 / (np.linalg.norm(vec2, axis=-1, keepdims=True) + 1e-10)
        inner_products = np.sum(normalized_vec1 * normalized_vec2, axis=-1)  # [n_squares, 4]
        squares_degree = np.arccos(inner_products) * 180 / np.pi  # [n_squares, 4]

        # get square score
        overlap_scores = []
        degree_scores = []
        length_scores = []

        for connects, segments, square, degree in zip(connect_array, segments_array, squares, squares_degree):
            '''
            0 -- 1
            |    |
            3 -- 2

            # segments: [4, 2]
            # connects: [4]
            '''

            ###################################### OVERLAP SCORES
            cover = 0
            perimeter = 0
            # check 0 > 1 > 2 > 3
            square_length = []

            for start_idx in range(4):
                end_idx = (start_idx + 1) % 4

                connect_idx = connects[start_idx]  # segment idx of segment01
                start_segments = segments[start_idx]
                end_segments = segments[end_idx]

                start_point = square[start_idx]
                end_point = square[end_idx]

                # check whether outside or inside
                start_position, start_min, start_cover_param, start_peri_param = check_outside_inside(start_segments,
                                                                                                      connect_idx)
                end_position, end_min, end_cover_param, end_peri_param = check_outside_inside(end_segments, connect_idx)

                cover += dist_segments[connect_idx] + start_cover_param * start_min + end_cover_param * end_min
                perimeter += dist_segments[connect_idx] + start_peri_param * start_min + end_peri_param * end_min

                square_length.append(
                    dist_segments[connect_idx] + start_peri_param * start_min + end_peri_param * end_min)

            overlap_scores.append(cover / perimeter)
            ######################################
            ###################################### DEGREE SCORES
            '''
            deg0 vs deg2
            deg1 vs deg3
            '''
            deg0, deg1, deg2, deg3 = degree
            deg_ratio1 = deg0 / deg2
            if deg_ratio1 > 1.0:
                deg_ratio1 = 1 / deg_ratio1
            deg_ratio2 = deg1 / deg3
            if deg_ratio2 > 1.0:
                deg_ratio2 = 1 / deg_ratio2
            degree_scores.append((deg_ratio1 + deg_ratio2) / 2)
            ######################################
            ###################################### LENGTH SCORES
            '''
            len0 vs len2
            len1 vs len3
            '''
            len0, len1, len2, len3 = square_length
            len_ratio1 = len0 / len2 if len2 > len0 else len2 / len0
            len_ratio2 = len1 / len3 if len3 > len1 else len3 / len1
            length_scores.append((len_ratio1 + len_ratio2) / 2)

            ######################################

        overlap_scores = np.array(overlap_scores)
        overlap_scores /= np.max(overlap_scores)

        degree_scores = np.array(degree_scores)
        # degree_scores /= np.max(degree_scores)

        length_scores = np.array(length_scores)

        ###################################### AREA SCORES
        area_scores = np.reshape(squares, [-1, 4, 2])
        area_x = area_scores[:, :, 0]
        area_y = area_scores[:, :, 1]
        correction = area_x[:, -1] * area_y[:, 0] - area_y[:, -1] * area_x[:, 0]
        area_scores = np.sum(area_x[:, :-1] * area_y[:, 1:], axis=-1) - np.sum(area_y[:, :-1] * area_x[:, 1:], axis=-1)
        area_scores = 0.5 * np.abs(area_scores + correction)
        area_scores /= (map_size * map_size)  # np.max(area_scores)
        ######################################

        ###################################### CENTER SCORES
        centers = np.array([[256 // 2, 256 // 2]], dtype='float32')  # [1, 2]
        # squares: [n, 4, 2]
        square_centers = np.mean(squares, axis=1)  # [n, 2]
        center2center = np.sqrt(np.sum((centers - square_centers) ** 2))
        center_scores = center2center / (map_size / np.sqrt(2.0))

        '''
        score_w = [overlap, degree, area, center, length]
        '''
        score_w = [0.0, 1.0, 10.0, 0.5, 1.0]
        score_array = params['w_overlap'] * overlap_scores \
                      + params['w_degree'] * degree_scores \
                      + params['w_area'] * area_scores \
                      - params['w_center'] * center_scores \
                      + params['w_length'] * length_scores

        best_square = []

        sorted_idx = np.argsort(score_array)[::-1]
        score_array = score_array[sorted_idx]
        squares = squares[sorted_idx]

    except Exception as e:
        pass

    '''return list
    merged_lines, squares, scores
    '''

    try:
        new_segments[:, 0] = new_segments[:, 0] * 2 / input_shape[1] * original_shape[1]
        new_segments[:, 1] = new_segments[:, 1] * 2 / input_shape[0] * original_shape[0]
        new_segments[:, 2] = new_segments[:, 2] * 2 / input_shape[1] * original_shape[1]
        new_segments[:, 3] = new_segments[:, 3] * 2 / input_shape[0] * original_shape[0]
    except:
        new_segments = []

    try:
        squares[:, :, 0] = squares[:, :, 0] * 2 / input_shape[1] * original_shape[1]
        squares[:, :, 1] = squares[:, :, 1] * 2 / input_shape[0] * original_shape[0]
    except:
        squares = []
        score_array = []

    try:
        inter_points = np.array(inter_points)
        inter_points[:, 0] = inter_points[:, 0] * 2 / input_shape[1] * original_shape[1]
        inter_points[:, 1] = inter_points[:, 1] * 2 / input_shape[0] * original_shape[0]
    except:
        inter_points = []

    return new_segments, squares, score_array, inter_points
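For reference, the AREA SCORES block above is the shoelace formula applied to each candidate quadrilateral. A minimal standalone sketch of that step, assuming only an [n, 4, 2] array of corners in 0 > 1 > 2 > 3 order (the `quads` name and the helper are illustrative and not part of this commit):

import numpy as np

def shoelace_area(quads):
    # quads: [n, 4, 2] corner coordinates, ordered 0 > 1 > 2 > 3
    x, y = quads[:, :, 0], quads[:, :, 1]
    # wrap-around term pairs the last corner with the first one
    correction = x[:, -1] * y[:, 0] - y[:, -1] * x[:, 0]
    main = np.sum(x[:, :-1] * y[:, 1:], axis=-1) - np.sum(y[:, :-1] * x[:, 1:], axis=-1)
    return 0.5 * np.abs(main + correction)

# a unit square should give an area of 1.0
print(shoelace_area(np.array([[[0, 0], [1, 0], [1, 1], [0, 1]]], dtype=float)))  # [1.]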
@@ -0,0 +1,15 @@
# Copyright (c) OpenMMLab. All rights reserved.
# flake8: noqa
from .arraymisc import *
from .fileio import *
from .image import *
from .utils import *
from .version import *
from .video import *
from .visualization import *

# The following modules are not imported to this level, so mmcv may be used
# without PyTorch.
# - runner
# - parallel
# - op
@@ -0,0 +1,4 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .quantization import dequantize, quantize

__all__ = ['quantize', 'dequantize']
@@ -0,0 +1,55 @@
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np


def quantize(arr, min_val, max_val, levels, dtype=np.int64):
    """Quantize an array of (-inf, inf) to [0, levels-1].

    Args:
        arr (ndarray): Input array.
        min_val (scalar): Minimum value to be clipped.
        max_val (scalar): Maximum value to be clipped.
        levels (int): Quantization levels.
        dtype (np.type): The type of the quantized array.

    Returns:
        ndarray: Quantized array.
    """
    if not (isinstance(levels, int) and levels > 1):
        raise ValueError(
            f'levels must be a positive integer, but got {levels}')
    if min_val >= max_val:
        raise ValueError(
            f'min_val ({min_val}) must be smaller than max_val ({max_val})')

    arr = np.clip(arr, min_val, max_val) - min_val
    quantized_arr = np.minimum(
        np.floor(levels * arr / (max_val - min_val)).astype(dtype), levels - 1)

    return quantized_arr


def dequantize(arr, min_val, max_val, levels, dtype=np.float64):
    """Dequantize an array.

    Args:
        arr (ndarray): Input array.
        min_val (scalar): Minimum value to be clipped.
        max_val (scalar): Maximum value to be clipped.
        levels (int): Quantization levels.
        dtype (np.type): The type of the dequantized array.

    Returns:
        ndarray: Dequantized array.
    """
    if not (isinstance(levels, int) and levels > 1):
        raise ValueError(
            f'levels must be a positive integer, but got {levels}')
    if min_val >= max_val:
        raise ValueError(
            f'min_val ({min_val}) must be smaller than max_val ({max_val})')

    dequantized_arr = (arr + 0.5).astype(dtype) * (max_val -
                                                   min_val) / levels + min_val

    return dequantized_arr
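A quick usage sketch for the two helpers above (illustrative values, not part of this commit): quantize() clips to [min_val, max_val] and maps values onto integer bins 0..levels-1, and dequantize() maps bin indices back to bin centers.

import numpy as np

values = np.array([-0.3, 0.0, 0.49, 1.2])
bins = quantize(values, min_val=0.0, max_val=1.0, levels=10)      # array([0, 0, 4, 9])
centers = dequantize(bins, min_val=0.0, max_val=1.0, levels=10)   # array([0.05, 0.05, 0.45, 0.95])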
@@ -0,0 +1,41 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .alexnet import AlexNet
# yapf: disable
from .bricks import (ACTIVATION_LAYERS, CONV_LAYERS, NORM_LAYERS,
                     PADDING_LAYERS, PLUGIN_LAYERS, UPSAMPLE_LAYERS,
                     ContextBlock, Conv2d, Conv3d, ConvAWS2d, ConvModule,
                     ConvTranspose2d, ConvTranspose3d, ConvWS2d,
                     DepthwiseSeparableConvModule, GeneralizedAttention,
                     HSigmoid, HSwish, Linear, MaxPool2d, MaxPool3d,
                     NonLocal1d, NonLocal2d, NonLocal3d, Scale, Swish,
                     build_activation_layer, build_conv_layer,
                     build_norm_layer, build_padding_layer, build_plugin_layer,
                     build_upsample_layer, conv_ws_2d, is_norm)
from .builder import MODELS, build_model_from_cfg
# yapf: enable
from .resnet import ResNet, make_res_layer
from .utils import (INITIALIZERS, Caffe2XavierInit, ConstantInit, KaimingInit,
                    NormalInit, PretrainedInit, TruncNormalInit, UniformInit,
                    XavierInit, bias_init_with_prob, caffe2_xavier_init,
                    constant_init, fuse_conv_bn, get_model_complexity_info,
                    initialize, kaiming_init, normal_init, trunc_normal_init,
                    uniform_init, xavier_init)
from .vgg import VGG, make_vgg_layer

__all__ = [
    'AlexNet', 'VGG', 'make_vgg_layer', 'ResNet', 'make_res_layer',
    'constant_init', 'xavier_init', 'normal_init', 'trunc_normal_init',
    'uniform_init', 'kaiming_init', 'caffe2_xavier_init',
    'bias_init_with_prob', 'ConvModule', 'build_activation_layer',
    'build_conv_layer', 'build_norm_layer', 'build_padding_layer',
    'build_upsample_layer', 'build_plugin_layer', 'is_norm', 'NonLocal1d',
    'NonLocal2d', 'NonLocal3d', 'ContextBlock', 'HSigmoid', 'Swish', 'HSwish',
    'GeneralizedAttention', 'ACTIVATION_LAYERS', 'CONV_LAYERS', 'NORM_LAYERS',
    'PADDING_LAYERS', 'UPSAMPLE_LAYERS', 'PLUGIN_LAYERS', 'Scale',
    'get_model_complexity_info', 'conv_ws_2d', 'ConvAWS2d', 'ConvWS2d',
    'fuse_conv_bn', 'DepthwiseSeparableConvModule', 'Linear', 'Conv2d',
    'ConvTranspose2d', 'MaxPool2d', 'ConvTranspose3d', 'MaxPool3d', 'Conv3d',
    'initialize', 'INITIALIZERS', 'ConstantInit', 'XavierInit', 'NormalInit',
    'TruncNormalInit', 'UniformInit', 'KaimingInit', 'PretrainedInit',
    'Caffe2XavierInit', 'MODELS', 'build_model_from_cfg'
]
@@ -0,0 +1,61 @@
# Copyright (c) OpenMMLab. All rights reserved.
import logging

import torch.nn as nn


class AlexNet(nn.Module):
    """AlexNet backbone.

    Args:
        num_classes (int): number of classes for classification.
    """

    def __init__(self, num_classes=-1):
        super(AlexNet, self).__init__()
        self.num_classes = num_classes
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
        )
        if self.num_classes > 0:
            self.classifier = nn.Sequential(
                nn.Dropout(),
                nn.Linear(256 * 6 * 6, 4096),
                nn.ReLU(inplace=True),
                nn.Dropout(),
                nn.Linear(4096, 4096),
                nn.ReLU(inplace=True),
                nn.Linear(4096, num_classes),
            )

    def init_weights(self, pretrained=None):
        if isinstance(pretrained, str):
            logger = logging.getLogger()
            from ..runner import load_checkpoint
            load_checkpoint(self, pretrained, strict=False, logger=logger)
        elif pretrained is None:
            # use default initializer
            pass
        else:
            raise TypeError('pretrained must be a str or None')

    def forward(self, x):

        x = self.features(x)
        if self.num_classes > 0:
            x = x.view(x.size(0), 256 * 6 * 6)
            x = self.classifier(x)

        return x
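A shape sanity check for the backbone above (illustrative, not part of this commit): with a 224x224 input the feature extractor ends at 256x6x6, which is exactly what the classifier head flattens when num_classes > 0.

import torch

model = AlexNet(num_classes=10)
out = model(torch.randn(1, 3, 224, 224))
print(out.shape)  # torch.Size([1, 10])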
@@ -0,0 +1,35 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .activation import build_activation_layer
from .context_block import ContextBlock
from .conv import build_conv_layer
from .conv2d_adaptive_padding import Conv2dAdaptivePadding
from .conv_module import ConvModule
from .conv_ws import ConvAWS2d, ConvWS2d, conv_ws_2d
from .depthwise_separable_conv_module import DepthwiseSeparableConvModule
from .drop import Dropout, DropPath
from .generalized_attention import GeneralizedAttention
from .hsigmoid import HSigmoid
from .hswish import HSwish
from .non_local import NonLocal1d, NonLocal2d, NonLocal3d
from .norm import build_norm_layer, is_norm
from .padding import build_padding_layer
from .plugin import build_plugin_layer
from .registry import (ACTIVATION_LAYERS, CONV_LAYERS, NORM_LAYERS,
                       PADDING_LAYERS, PLUGIN_LAYERS, UPSAMPLE_LAYERS)
from .scale import Scale
from .swish import Swish
from .upsample import build_upsample_layer
from .wrappers import (Conv2d, Conv3d, ConvTranspose2d, ConvTranspose3d,
                       Linear, MaxPool2d, MaxPool3d)

__all__ = [
    'ConvModule', 'build_activation_layer', 'build_conv_layer',
    'build_norm_layer', 'build_padding_layer', 'build_upsample_layer',
    'build_plugin_layer', 'is_norm', 'HSigmoid', 'HSwish', 'NonLocal1d',
    'NonLocal2d', 'NonLocal3d', 'ContextBlock', 'GeneralizedAttention',
    'ACTIVATION_LAYERS', 'CONV_LAYERS', 'NORM_LAYERS', 'PADDING_LAYERS',
    'UPSAMPLE_LAYERS', 'PLUGIN_LAYERS', 'Scale', 'ConvAWS2d', 'ConvWS2d',
    'conv_ws_2d', 'DepthwiseSeparableConvModule', 'Swish', 'Linear',
    'Conv2dAdaptivePadding', 'Conv2d', 'ConvTranspose2d', 'MaxPool2d',
    'ConvTranspose3d', 'MaxPool3d', 'Conv3d', 'Dropout', 'DropPath'
]
@@ -0,0 +1,92 @@
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn as nn
import torch.nn.functional as F

from annotator.mmpkg.mmcv.utils import TORCH_VERSION, build_from_cfg, digit_version
from .registry import ACTIVATION_LAYERS

for module in [
        nn.ReLU, nn.LeakyReLU, nn.PReLU, nn.RReLU, nn.ReLU6, nn.ELU,
        nn.Sigmoid, nn.Tanh
]:
    ACTIVATION_LAYERS.register_module(module=module)


@ACTIVATION_LAYERS.register_module(name='Clip')
@ACTIVATION_LAYERS.register_module()
class Clamp(nn.Module):
    """Clamp activation layer.

    This activation function is to clamp the feature map value within
    :math:`[min, max]`. More details can be found in ``torch.clamp()``.

    Args:
        min (Number | optional): Lower-bound of the range to be clamped to.
            Default to -1.
        max (Number | optional): Upper-bound of the range to be clamped to.
            Default to 1.
    """

    def __init__(self, min=-1., max=1.):
        super(Clamp, self).__init__()
        self.min = min
        self.max = max

    def forward(self, x):
        """Forward function.

        Args:
            x (torch.Tensor): The input tensor.

        Returns:
            torch.Tensor: Clamped tensor.
        """
        return torch.clamp(x, min=self.min, max=self.max)


class GELU(nn.Module):
    r"""Applies the Gaussian Error Linear Units function:

    .. math::
        \text{GELU}(x) = x * \Phi(x)
    where :math:`\Phi(x)` is the Cumulative Distribution Function for
    Gaussian Distribution.

    Shape:
        - Input: :math:`(N, *)` where `*` means, any number of additional
          dimensions
        - Output: :math:`(N, *)`, same shape as the input

    .. image:: scripts/activation_images/GELU.png

    Examples::

        >>> m = nn.GELU()
        >>> input = torch.randn(2)
        >>> output = m(input)
    """

    def forward(self, input):
        return F.gelu(input)


if (TORCH_VERSION == 'parrots'
        or digit_version(TORCH_VERSION) < digit_version('1.4')):
    ACTIVATION_LAYERS.register_module(module=GELU)
else:
    ACTIVATION_LAYERS.register_module(module=nn.GELU)


def build_activation_layer(cfg):
    """Build activation layer.

    Args:
        cfg (dict): The activation layer config, which should contain:
            - type (str): Layer type.
            - layer args: Args needed to instantiate an activation layer.

    Returns:
        nn.Module: Created activation layer.
    """
    return build_from_cfg(cfg, ACTIVATION_LAYERS)
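A usage sketch for build_activation_layer() (illustrative configs, not part of this commit): the 'type' key selects a registered layer and the remaining keys are forwarded to its constructor by build_from_cfg.

relu = build_activation_layer(dict(type='ReLU', inplace=True))
leaky = build_activation_layer(dict(type='LeakyReLU', negative_slope=0.2))
clamp = build_activation_layer(dict(type='Clip', min=0., max=6.))  # 'Clip' is an alias of Clamp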
@@ -0,0 +1,125 @@
|
||||
# Copyright (c) OpenMMLab. All rights reserved.
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
from ..utils import constant_init, kaiming_init
|
||||
from .registry import PLUGIN_LAYERS
|
||||
|
||||
|
||||
def last_zero_init(m):
|
||||
if isinstance(m, nn.Sequential):
|
||||
constant_init(m[-1], val=0)
|
||||
else:
|
||||
constant_init(m, val=0)
|
||||
|
||||
|
||||
@PLUGIN_LAYERS.register_module()
|
||||
class ContextBlock(nn.Module):
|
||||
"""ContextBlock module in GCNet.
|
||||
|
||||
See 'GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond'
|
||||
(https://arxiv.org/abs/1904.11492) for details.
|
||||
|
||||
Args:
|
||||
in_channels (int): Channels of the input feature map.
|
||||
ratio (float): Ratio of channels of transform bottleneck
|
||||
pooling_type (str): Pooling method for context modeling.
|
||||
Options are 'att' and 'avg', stand for attention pooling and
|
||||
average pooling respectively. Default: 'att'.
|
||||
fusion_types (Sequence[str]): Fusion method for feature fusion,
|
||||
Options are 'channels_add', 'channel_mul', stand for channelwise
|
||||
addition and multiplication respectively. Default: ('channel_add',)
|
||||
"""
|
||||
|
||||
_abbr_ = 'context_block'
|
||||
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
ratio,
|
||||
pooling_type='att',
|
||||
fusion_types=('channel_add', )):
|
||||
super(ContextBlock, self).__init__()
|
||||
assert pooling_type in ['avg', 'att']
|
||||
assert isinstance(fusion_types, (list, tuple))
|
||||
valid_fusion_types = ['channel_add', 'channel_mul']
|
||||
assert all([f in valid_fusion_types for f in fusion_types])
|
||||
assert len(fusion_types) > 0, 'at least one fusion should be used'
|
||||
self.in_channels = in_channels
|
||||
self.ratio = ratio
|
||||
self.planes = int(in_channels * ratio)
|
||||
self.pooling_type = pooling_type
|
||||
self.fusion_types = fusion_types
|
||||
if pooling_type == 'att':
|
||||
self.conv_mask = nn.Conv2d(in_channels, 1, kernel_size=1)
|
||||
self.softmax = nn.Softmax(dim=2)
|
||||
else:
|
||||
self.avg_pool = nn.AdaptiveAvgPool2d(1)
|
||||
if 'channel_add' in fusion_types:
|
||||
self.channel_add_conv = nn.Sequential(
|
||||
nn.Conv2d(self.in_channels, self.planes, kernel_size=1),
|
||||
nn.LayerNorm([self.planes, 1, 1]),
|
||||
nn.ReLU(inplace=True), # yapf: disable
|
||||
nn.Conv2d(self.planes, self.in_channels, kernel_size=1))
|
||||
else:
|
||||
self.channel_add_conv = None
|
||||
if 'channel_mul' in fusion_types:
|
||||
self.channel_mul_conv = nn.Sequential(
|
||||
nn.Conv2d(self.in_channels, self.planes, kernel_size=1),
|
||||
nn.LayerNorm([self.planes, 1, 1]),
|
||||
nn.ReLU(inplace=True), # yapf: disable
|
||||
nn.Conv2d(self.planes, self.in_channels, kernel_size=1))
|
||||
else:
|
||||
self.channel_mul_conv = None
|
||||
self.reset_parameters()
|
||||
|
||||
def reset_parameters(self):
|
||||
if self.pooling_type == 'att':
|
||||
kaiming_init(self.conv_mask, mode='fan_in')
|
||||
self.conv_mask.inited = True
|
||||
|
||||
if self.channel_add_conv is not None:
|
||||
last_zero_init(self.channel_add_conv)
|
||||
if self.channel_mul_conv is not None:
|
||||
last_zero_init(self.channel_mul_conv)
|
||||
|
||||
def spatial_pool(self, x):
|
||||
batch, channel, height, width = x.size()
|
||||
if self.pooling_type == 'att':
|
||||
input_x = x
|
||||
# [N, C, H * W]
|
||||
input_x = input_x.view(batch, channel, height * width)
|
||||
# [N, 1, C, H * W]
|
||||
input_x = input_x.unsqueeze(1)
|
||||
# [N, 1, H, W]
|
||||
context_mask = self.conv_mask(x)
|
||||
# [N, 1, H * W]
|
||||
context_mask = context_mask.view(batch, 1, height * width)
|
||||
# [N, 1, H * W]
|
||||
context_mask = self.softmax(context_mask)
|
||||
# [N, 1, H * W, 1]
|
||||
context_mask = context_mask.unsqueeze(-1)
|
||||
# [N, 1, C, 1]
|
||||
context = torch.matmul(input_x, context_mask)
|
||||
# [N, C, 1, 1]
|
||||
context = context.view(batch, channel, 1, 1)
|
||||
else:
|
||||
# [N, C, 1, 1]
|
||||
context = self.avg_pool(x)
|
||||
|
||||
return context
|
||||
|
||||
def forward(self, x):
|
||||
# [N, C, 1, 1]
|
||||
context = self.spatial_pool(x)
|
||||
|
||||
out = x
|
||||
if self.channel_mul_conv is not None:
|
||||
# [N, C, 1, 1]
|
||||
channel_mul_term = torch.sigmoid(self.channel_mul_conv(context))
|
||||
out = out * channel_mul_term
|
||||
if self.channel_add_conv is not None:
|
||||
# [N, C, 1, 1]
|
||||
channel_add_term = self.channel_add_conv(context)
|
||||
out = out + channel_add_term
|
||||
|
||||
return out
|
||||
@@ -0,0 +1,44 @@
|
||||
# Copyright (c) OpenMMLab. All rights reserved.
|
||||
from torch import nn
|
||||
|
||||
from .registry import CONV_LAYERS
|
||||
|
||||
CONV_LAYERS.register_module('Conv1d', module=nn.Conv1d)
|
||||
CONV_LAYERS.register_module('Conv2d', module=nn.Conv2d)
|
||||
CONV_LAYERS.register_module('Conv3d', module=nn.Conv3d)
|
||||
CONV_LAYERS.register_module('Conv', module=nn.Conv2d)
|
||||
|
||||
|
||||
def build_conv_layer(cfg, *args, **kwargs):
|
||||
"""Build convolution layer.
|
||||
|
||||
Args:
|
||||
cfg (None or dict): The conv layer config, which should contain:
|
||||
- type (str): Layer type.
|
||||
- layer args: Args needed to instantiate an conv layer.
|
||||
args (argument list): Arguments passed to the `__init__`
|
||||
method of the corresponding conv layer.
|
||||
kwargs (keyword arguments): Keyword arguments passed to the `__init__`
|
||||
method of the corresponding conv layer.
|
||||
|
||||
Returns:
|
||||
nn.Module: Created conv layer.
|
||||
"""
|
||||
if cfg is None:
|
||||
cfg_ = dict(type='Conv2d')
|
||||
else:
|
||||
if not isinstance(cfg, dict):
|
||||
raise TypeError('cfg must be a dict')
|
||||
if 'type' not in cfg:
|
||||
raise KeyError('the cfg dict must contain the key "type"')
|
||||
cfg_ = cfg.copy()
|
||||
|
||||
layer_type = cfg_.pop('type')
|
||||
if layer_type not in CONV_LAYERS:
|
||||
raise KeyError(f'Unrecognized norm type {layer_type}')
|
||||
else:
|
||||
conv_layer = CONV_LAYERS.get(layer_type)
|
||||
|
||||
layer = conv_layer(*args, **kwargs, **cfg_)
|
||||
|
||||
return layer
|
||||
@@ -0,0 +1,62 @@
|
||||
# Copyright (c) OpenMMLab. All rights reserved.
|
||||
import math
|
||||
|
||||
from torch import nn
|
||||
from torch.nn import functional as F
|
||||
|
||||
from .registry import CONV_LAYERS
|
||||
|
||||
|
||||
@CONV_LAYERS.register_module()
|
||||
class Conv2dAdaptivePadding(nn.Conv2d):
|
||||
"""Implementation of 2D convolution in tensorflow with `padding` as "same",
|
||||
which applies padding to input (if needed) so that input image gets fully
|
||||
covered by filter and stride you specified. For stride 1, this will ensure
|
||||
that output image size is same as input. For stride of 2, output dimensions
|
||||
will be half, for example.
|
||||
|
||||
Args:
|
||||
in_channels (int): Number of channels in the input image
|
||||
out_channels (int): Number of channels produced by the convolution
|
||||
kernel_size (int or tuple): Size of the convolving kernel
|
||||
stride (int or tuple, optional): Stride of the convolution. Default: 1
|
||||
padding (int or tuple, optional): Zero-padding added to both sides of
|
||||
the input. Default: 0
|
||||
dilation (int or tuple, optional): Spacing between kernel elements.
|
||||
Default: 1
|
||||
groups (int, optional): Number of blocked connections from input
|
||||
channels to output channels. Default: 1
|
||||
bias (bool, optional): If ``True``, adds a learnable bias to the
|
||||
output. Default: ``True``
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
out_channels,
|
||||
kernel_size,
|
||||
stride=1,
|
||||
padding=0,
|
||||
dilation=1,
|
||||
groups=1,
|
||||
bias=True):
|
||||
super().__init__(in_channels, out_channels, kernel_size, stride, 0,
|
||||
dilation, groups, bias)
|
||||
|
||||
def forward(self, x):
|
||||
img_h, img_w = x.size()[-2:]
|
||||
kernel_h, kernel_w = self.weight.size()[-2:]
|
||||
stride_h, stride_w = self.stride
|
||||
output_h = math.ceil(img_h / stride_h)
|
||||
output_w = math.ceil(img_w / stride_w)
|
||||
pad_h = (
|
||||
max((output_h - 1) * self.stride[0] +
|
||||
(kernel_h - 1) * self.dilation[0] + 1 - img_h, 0))
|
||||
pad_w = (
|
||||
max((output_w - 1) * self.stride[1] +
|
||||
(kernel_w - 1) * self.dilation[1] + 1 - img_w, 0))
|
||||
if pad_h > 0 or pad_w > 0:
|
||||
x = F.pad(x, [
|
||||
pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2
|
||||
])
|
||||
return F.conv2d(x, self.weight, self.bias, self.stride, self.padding,
|
||||
self.dilation, self.groups)
|
||||
@@ -0,0 +1,206 @@
|
||||
# Copyright (c) OpenMMLab. All rights reserved.
|
||||
import warnings
|
||||
|
||||
import torch.nn as nn
|
||||
|
||||
from annotator.mmpkg.mmcv.utils import _BatchNorm, _InstanceNorm
|
||||
from ..utils import constant_init, kaiming_init
|
||||
from .activation import build_activation_layer
|
||||
from .conv import build_conv_layer
|
||||
from .norm import build_norm_layer
|
||||
from .padding import build_padding_layer
|
||||
from .registry import PLUGIN_LAYERS
|
||||
|
||||
|
||||
@PLUGIN_LAYERS.register_module()
|
||||
class ConvModule(nn.Module):
|
||||
"""A conv block that bundles conv/norm/activation layers.
|
||||
|
||||
This block simplifies the usage of convolution layers, which are commonly
|
||||
used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU).
|
||||
It is based upon three build methods: `build_conv_layer()`,
|
||||
`build_norm_layer()` and `build_activation_layer()`.
|
||||
|
||||
Besides, we add some additional features in this module.
|
||||
1. Automatically set `bias` of the conv layer.
|
||||
2. Spectral norm is supported.
|
||||
3. More padding modes are supported. Before PyTorch 1.5, nn.Conv2d only
|
||||
supports zero and circular padding, and we add "reflect" padding mode.
|
||||
|
||||
Args:
|
||||
in_channels (int): Number of channels in the input feature map.
|
||||
Same as that in ``nn._ConvNd``.
|
||||
out_channels (int): Number of channels produced by the convolution.
|
||||
Same as that in ``nn._ConvNd``.
|
||||
kernel_size (int | tuple[int]): Size of the convolving kernel.
|
||||
Same as that in ``nn._ConvNd``.
|
||||
stride (int | tuple[int]): Stride of the convolution.
|
||||
Same as that in ``nn._ConvNd``.
|
||||
padding (int | tuple[int]): Zero-padding added to both sides of
|
||||
the input. Same as that in ``nn._ConvNd``.
|
||||
dilation (int | tuple[int]): Spacing between kernel elements.
|
||||
Same as that in ``nn._ConvNd``.
|
||||
groups (int): Number of blocked connections from input channels to
|
||||
output channels. Same as that in ``nn._ConvNd``.
|
||||
bias (bool | str): If specified as `auto`, it will be decided by the
|
||||
norm_cfg. Bias will be set as True if `norm_cfg` is None, otherwise
|
||||
False. Default: "auto".
|
||||
conv_cfg (dict): Config dict for convolution layer. Default: None,
|
||||
which means using conv2d.
|
||||
norm_cfg (dict): Config dict for normalization layer. Default: None.
|
||||
act_cfg (dict): Config dict for activation layer.
|
||||
Default: dict(type='ReLU').
|
||||
inplace (bool): Whether to use inplace mode for activation.
|
||||
Default: True.
|
||||
with_spectral_norm (bool): Whether use spectral norm in conv module.
|
||||
Default: False.
|
||||
padding_mode (str): If the `padding_mode` has not been supported by
|
||||
current `Conv2d` in PyTorch, we will use our own padding layer
|
||||
instead. Currently, we support ['zeros', 'circular'] with official
|
||||
implementation and ['reflect'] with our own implementation.
|
||||
Default: 'zeros'.
|
||||
order (tuple[str]): The order of conv/norm/activation layers. It is a
|
||||
sequence of "conv", "norm" and "act". Common examples are
|
||||
("conv", "norm", "act") and ("act", "conv", "norm").
|
||||
Default: ('conv', 'norm', 'act').
|
||||
"""
|
||||
|
||||
_abbr_ = 'conv_block'
|
||||
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
out_channels,
|
||||
kernel_size,
|
||||
stride=1,
|
||||
padding=0,
|
||||
dilation=1,
|
||||
groups=1,
|
||||
bias='auto',
|
||||
conv_cfg=None,
|
||||
norm_cfg=None,
|
||||
act_cfg=dict(type='ReLU'),
|
||||
inplace=True,
|
||||
with_spectral_norm=False,
|
||||
padding_mode='zeros',
|
||||
order=('conv', 'norm', 'act')):
|
||||
super(ConvModule, self).__init__()
|
||||
assert conv_cfg is None or isinstance(conv_cfg, dict)
|
||||
assert norm_cfg is None or isinstance(norm_cfg, dict)
|
||||
assert act_cfg is None or isinstance(act_cfg, dict)
|
||||
official_padding_mode = ['zeros', 'circular']
|
||||
self.conv_cfg = conv_cfg
|
||||
self.norm_cfg = norm_cfg
|
||||
self.act_cfg = act_cfg
|
||||
self.inplace = inplace
|
||||
self.with_spectral_norm = with_spectral_norm
|
||||
self.with_explicit_padding = padding_mode not in official_padding_mode
|
||||
self.order = order
|
||||
assert isinstance(self.order, tuple) and len(self.order) == 3
|
||||
assert set(order) == set(['conv', 'norm', 'act'])
|
||||
|
||||
self.with_norm = norm_cfg is not None
|
||||
self.with_activation = act_cfg is not None
|
||||
# if the conv layer is before a norm layer, bias is unnecessary.
|
||||
if bias == 'auto':
|
||||
bias = not self.with_norm
|
||||
self.with_bias = bias
|
||||
|
||||
if self.with_explicit_padding:
|
||||
pad_cfg = dict(type=padding_mode)
|
||||
self.padding_layer = build_padding_layer(pad_cfg, padding)
|
||||
|
||||
# reset padding to 0 for conv module
|
||||
conv_padding = 0 if self.with_explicit_padding else padding
|
||||
# build convolution layer
|
||||
self.conv = build_conv_layer(
|
||||
conv_cfg,
|
||||
in_channels,
|
||||
out_channels,
|
||||
kernel_size,
|
||||
stride=stride,
|
||||
padding=conv_padding,
|
||||
dilation=dilation,
|
||||
groups=groups,
|
||||
bias=bias)
|
||||
# export the attributes of self.conv to a higher level for convenience
|
||||
self.in_channels = self.conv.in_channels
|
||||
self.out_channels = self.conv.out_channels
|
||||
self.kernel_size = self.conv.kernel_size
|
||||
self.stride = self.conv.stride
|
||||
self.padding = padding
|
||||
self.dilation = self.conv.dilation
|
||||
self.transposed = self.conv.transposed
|
||||
self.output_padding = self.conv.output_padding
|
||||
self.groups = self.conv.groups
|
||||
|
||||
if self.with_spectral_norm:
|
||||
self.conv = nn.utils.spectral_norm(self.conv)
|
||||
|
||||
# build normalization layers
|
||||
if self.with_norm:
|
||||
# norm layer is after conv layer
|
||||
if order.index('norm') > order.index('conv'):
|
||||
norm_channels = out_channels
|
||||
else:
|
||||
norm_channels = in_channels
|
||||
self.norm_name, norm = build_norm_layer(norm_cfg, norm_channels)
|
||||
self.add_module(self.norm_name, norm)
|
||||
if self.with_bias:
|
||||
if isinstance(norm, (_BatchNorm, _InstanceNorm)):
|
||||
warnings.warn(
|
||||
'Unnecessary conv bias before batch/instance norm')
|
||||
else:
|
||||
self.norm_name = None
|
||||
|
||||
# build activation layer
|
||||
if self.with_activation:
|
||||
act_cfg_ = act_cfg.copy()
|
||||
# nn.Tanh has no 'inplace' argument
|
||||
if act_cfg_['type'] not in [
|
||||
'Tanh', 'PReLU', 'Sigmoid', 'HSigmoid', 'Swish'
|
||||
]:
|
||||
act_cfg_.setdefault('inplace', inplace)
|
||||
self.activate = build_activation_layer(act_cfg_)
|
||||
|
||||
# Use msra init by default
|
||||
self.init_weights()
|
||||
|
||||
@property
|
||||
def norm(self):
|
||||
if self.norm_name:
|
||||
return getattr(self, self.norm_name)
|
||||
else:
|
||||
return None
|
||||
|
||||
def init_weights(self):
|
||||
# 1. It is mainly for customized conv layers with their own
|
||||
# initialization manners by calling their own ``init_weights()``,
|
||||
# and we do not want ConvModule to override the initialization.
|
||||
# 2. For customized conv layers without their own initialization
|
||||
# manners (that is, they don't have their own ``init_weights()``)
|
||||
# and PyTorch's conv layers, they will be initialized by
|
||||
# this method with default ``kaiming_init``.
|
||||
# Note: For PyTorch's conv layers, they will be overwritten by our
|
||||
# initialization implementation using default ``kaiming_init``.
|
||||
if not hasattr(self.conv, 'init_weights'):
|
||||
if self.with_activation and self.act_cfg['type'] == 'LeakyReLU':
|
||||
nonlinearity = 'leaky_relu'
|
||||
a = self.act_cfg.get('negative_slope', 0.01)
|
||||
else:
|
||||
nonlinearity = 'relu'
|
||||
a = 0
|
||||
kaiming_init(self.conv, a=a, nonlinearity=nonlinearity)
|
||||
if self.with_norm:
|
||||
constant_init(self.norm, 1, bias=0)
|
||||
|
||||
def forward(self, x, activate=True, norm=True):
|
||||
for layer in self.order:
|
||||
if layer == 'conv':
|
||||
if self.with_explicit_padding:
|
||||
x = self.padding_layer(x)
|
||||
x = self.conv(x)
|
||||
elif layer == 'norm' and norm and self.with_norm:
|
||||
x = self.norm(x)
|
||||
elif layer == 'act' and activate and self.with_activation:
|
||||
x = self.activate(x)
|
||||
return x
|
||||
@@ -0,0 +1,148 @@
|
||||
# Copyright (c) OpenMMLab. All rights reserved.
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
from .registry import CONV_LAYERS
|
||||
|
||||
|
||||
def conv_ws_2d(input,
|
||||
weight,
|
||||
bias=None,
|
||||
stride=1,
|
||||
padding=0,
|
||||
dilation=1,
|
||||
groups=1,
|
||||
eps=1e-5):
|
||||
c_in = weight.size(0)
|
||||
weight_flat = weight.view(c_in, -1)
|
||||
mean = weight_flat.mean(dim=1, keepdim=True).view(c_in, 1, 1, 1)
|
||||
std = weight_flat.std(dim=1, keepdim=True).view(c_in, 1, 1, 1)
|
||||
weight = (weight - mean) / (std + eps)
|
||||
return F.conv2d(input, weight, bias, stride, padding, dilation, groups)
|
||||
|
||||
|
||||
@CONV_LAYERS.register_module('ConvWS')
|
||||
class ConvWS2d(nn.Conv2d):
|
||||
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
out_channels,
|
||||
kernel_size,
|
||||
stride=1,
|
||||
padding=0,
|
||||
dilation=1,
|
||||
groups=1,
|
||||
bias=True,
|
||||
eps=1e-5):
|
||||
super(ConvWS2d, self).__init__(
|
||||
in_channels,
|
||||
out_channels,
|
||||
kernel_size,
|
||||
stride=stride,
|
||||
padding=padding,
|
||||
dilation=dilation,
|
||||
groups=groups,
|
||||
bias=bias)
|
||||
self.eps = eps
|
||||
|
||||
def forward(self, x):
|
||||
return conv_ws_2d(x, self.weight, self.bias, self.stride, self.padding,
|
||||
self.dilation, self.groups, self.eps)
|
||||
|
||||
|
||||
@CONV_LAYERS.register_module(name='ConvAWS')
|
||||
class ConvAWS2d(nn.Conv2d):
|
||||
"""AWS (Adaptive Weight Standardization)
|
||||
|
||||
This is a variant of Weight Standardization
|
||||
(https://arxiv.org/pdf/1903.10520.pdf)
|
||||
It is used in DetectoRS to avoid NaN
|
||||
(https://arxiv.org/pdf/2006.02334.pdf)
|
||||
|
||||
Args:
|
||||
in_channels (int): Number of channels in the input image
|
||||
out_channels (int): Number of channels produced by the convolution
|
||||
kernel_size (int or tuple): Size of the conv kernel
|
||||
stride (int or tuple, optional): Stride of the convolution. Default: 1
|
||||
padding (int or tuple, optional): Zero-padding added to both sides of
|
||||
the input. Default: 0
|
||||
dilation (int or tuple, optional): Spacing between kernel elements.
|
||||
Default: 1
|
||||
groups (int, optional): Number of blocked connections from input
|
||||
channels to output channels. Default: 1
|
||||
bias (bool, optional): If set True, adds a learnable bias to the
|
||||
output. Default: True
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
out_channels,
|
||||
kernel_size,
|
||||
stride=1,
|
||||
padding=0,
|
||||
dilation=1,
|
||||
groups=1,
|
||||
bias=True):
|
||||
super().__init__(
|
||||
in_channels,
|
||||
out_channels,
|
||||
kernel_size,
|
||||
stride=stride,
|
||||
padding=padding,
|
||||
dilation=dilation,
|
||||
groups=groups,
|
||||
bias=bias)
|
||||
self.register_buffer('weight_gamma',
|
||||
torch.ones(self.out_channels, 1, 1, 1))
|
||||
self.register_buffer('weight_beta',
|
||||
torch.zeros(self.out_channels, 1, 1, 1))
|
||||
|
||||
def _get_weight(self, weight):
|
||||
weight_flat = weight.view(weight.size(0), -1)
|
||||
mean = weight_flat.mean(dim=1).view(-1, 1, 1, 1)
|
||||
std = torch.sqrt(weight_flat.var(dim=1) + 1e-5).view(-1, 1, 1, 1)
|
||||
weight = (weight - mean) / std
|
||||
weight = self.weight_gamma * weight + self.weight_beta
|
||||
return weight
|
||||
|
||||
def forward(self, x):
|
||||
weight = self._get_weight(self.weight)
|
||||
return F.conv2d(x, weight, self.bias, self.stride, self.padding,
|
||||
self.dilation, self.groups)
|
||||
|
||||
def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
|
||||
missing_keys, unexpected_keys, error_msgs):
|
||||
"""Override default load function.
|
||||
|
||||
AWS overrides the function _load_from_state_dict to recover
|
||||
weight_gamma and weight_beta if they are missing. If weight_gamma and
|
||||
weight_beta are found in the checkpoint, this function will return
|
||||
after super()._load_from_state_dict. Otherwise, it will compute the
|
||||
mean and std of the pretrained weights and store them in weight_beta
|
||||
and weight_gamma.
|
||||
"""
|
||||
|
||||
self.weight_gamma.data.fill_(-1)
|
||||
local_missing_keys = []
|
||||
super()._load_from_state_dict(state_dict, prefix, local_metadata,
|
||||
strict, local_missing_keys,
|
||||
unexpected_keys, error_msgs)
|
||||
if self.weight_gamma.data.mean() > 0:
|
||||
for k in local_missing_keys:
|
||||
missing_keys.append(k)
|
||||
return
|
||||
weight = self.weight.data
|
||||
weight_flat = weight.view(weight.size(0), -1)
|
||||
mean = weight_flat.mean(dim=1).view(-1, 1, 1, 1)
|
||||
std = torch.sqrt(weight_flat.var(dim=1) + 1e-5).view(-1, 1, 1, 1)
|
||||
self.weight_beta.data.copy_(mean)
|
||||
self.weight_gamma.data.copy_(std)
|
||||
missing_gamma_beta = [
|
||||
k for k in local_missing_keys
|
||||
if k.endswith('weight_gamma') or k.endswith('weight_beta')
|
||||
]
|
||||
for k in missing_gamma_beta:
|
||||
local_missing_keys.remove(k)
|
||||
for k in local_missing_keys:
|
||||
missing_keys.append(k)
|
||||
@@ -0,0 +1,96 @@
|
||||
# Copyright (c) OpenMMLab. All rights reserved.
|
||||
import torch.nn as nn
|
||||
|
||||
from .conv_module import ConvModule
|
||||
|
||||
|
||||
class DepthwiseSeparableConvModule(nn.Module):
|
||||
"""Depthwise separable convolution module.
|
||||
|
||||
See https://arxiv.org/pdf/1704.04861.pdf for details.
|
||||
|
||||
This module can replace a ConvModule with the conv block replaced by two
|
||||
conv block: depthwise conv block and pointwise conv block. The depthwise
|
||||
conv block contains depthwise-conv/norm/activation layers. The pointwise
|
||||
conv block contains pointwise-conv/norm/activation layers. It should be
|
||||
noted that there will be norm/activation layer in the depthwise conv block
|
||||
if `norm_cfg` and `act_cfg` are specified.
|
||||
|
||||
Args:
|
||||
in_channels (int): Number of channels in the input feature map.
|
||||
Same as that in ``nn._ConvNd``.
|
||||
out_channels (int): Number of channels produced by the convolution.
|
||||
Same as that in ``nn._ConvNd``.
|
||||
kernel_size (int | tuple[int]): Size of the convolving kernel.
|
||||
Same as that in ``nn._ConvNd``.
|
||||
stride (int | tuple[int]): Stride of the convolution.
|
||||
Same as that in ``nn._ConvNd``. Default: 1.
|
||||
padding (int | tuple[int]): Zero-padding added to both sides of
|
||||
the input. Same as that in ``nn._ConvNd``. Default: 0.
|
||||
dilation (int | tuple[int]): Spacing between kernel elements.
|
||||
Same as that in ``nn._ConvNd``. Default: 1.
|
||||
norm_cfg (dict): Default norm config for both depthwise ConvModule and
|
||||
pointwise ConvModule. Default: None.
|
||||
act_cfg (dict): Default activation config for both depthwise ConvModule
|
||||
and pointwise ConvModule. Default: dict(type='ReLU').
|
||||
dw_norm_cfg (dict): Norm config of depthwise ConvModule. If it is
|
||||
'default', it will be the same as `norm_cfg`. Default: 'default'.
|
||||
dw_act_cfg (dict): Activation config of depthwise ConvModule. If it is
|
||||
'default', it will be the same as `act_cfg`. Default: 'default'.
|
||||
pw_norm_cfg (dict): Norm config of pointwise ConvModule. If it is
|
||||
'default', it will be the same as `norm_cfg`. Default: 'default'.
|
||||
pw_act_cfg (dict): Activation config of pointwise ConvModule. If it is
|
||||
'default', it will be the same as `act_cfg`. Default: 'default'.
|
||||
kwargs (optional): Other shared arguments for depthwise and pointwise
|
||||
ConvModule. See ConvModule for ref.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
out_channels,
|
||||
kernel_size,
|
||||
stride=1,
|
||||
padding=0,
|
||||
dilation=1,
|
||||
norm_cfg=None,
|
||||
act_cfg=dict(type='ReLU'),
|
||||
dw_norm_cfg='default',
|
||||
dw_act_cfg='default',
|
||||
pw_norm_cfg='default',
|
||||
pw_act_cfg='default',
|
||||
**kwargs):
|
||||
super(DepthwiseSeparableConvModule, self).__init__()
|
||||
assert 'groups' not in kwargs, 'groups should not be specified'
|
||||
|
||||
# if norm/activation config of depthwise/pointwise ConvModule is not
|
||||
# specified, use default config.
|
||||
dw_norm_cfg = dw_norm_cfg if dw_norm_cfg != 'default' else norm_cfg
|
||||
dw_act_cfg = dw_act_cfg if dw_act_cfg != 'default' else act_cfg
|
||||
pw_norm_cfg = pw_norm_cfg if pw_norm_cfg != 'default' else norm_cfg
|
||||
pw_act_cfg = pw_act_cfg if pw_act_cfg != 'default' else act_cfg
|
||||
|
||||
# depthwise convolution
|
||||
self.depthwise_conv = ConvModule(
|
||||
in_channels,
|
||||
in_channels,
|
||||
kernel_size,
|
||||
stride=stride,
|
||||
padding=padding,
|
||||
dilation=dilation,
|
||||
groups=in_channels,
|
||||
norm_cfg=dw_norm_cfg,
|
||||
act_cfg=dw_act_cfg,
|
||||
**kwargs)
|
||||
|
||||
self.pointwise_conv = ConvModule(
|
||||
in_channels,
|
||||
out_channels,
|
||||
1,
|
||||
norm_cfg=pw_norm_cfg,
|
||||
act_cfg=pw_act_cfg,
|
||||
**kwargs)
|
||||
|
||||
def forward(self, x):
|
||||
x = self.depthwise_conv(x)
|
||||
x = self.pointwise_conv(x)
|
||||
return x
|
||||
@@ -0,0 +1,65 @@
|
||||
# Copyright (c) OpenMMLab. All rights reserved.
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from annotator.mmpkg.mmcv import build_from_cfg
|
||||
from .registry import DROPOUT_LAYERS
|
||||
|
||||
|
||||
def drop_path(x, drop_prob=0., training=False):
|
||||
"""Drop paths (Stochastic Depth) per sample (when applied in main path of
|
||||
residual blocks).
|
||||
|
||||
We follow the implementation
|
||||
https://github.com/rwightman/pytorch-image-models/blob/a2727c1bf78ba0d7b5727f5f95e37fb7f8866b1f/timm/models/layers/drop.py # noqa: E501
|
||||
"""
|
||||
if drop_prob == 0. or not training:
|
||||
return x
|
||||
keep_prob = 1 - drop_prob
|
||||
# handle tensors with different dimensions, not just 4D tensors.
|
||||
shape = (x.shape[0], ) + (1, ) * (x.ndim - 1)
|
||||
random_tensor = keep_prob + torch.rand(
|
||||
shape, dtype=x.dtype, device=x.device)
|
||||
output = x.div(keep_prob) * random_tensor.floor()
|
||||
return output
|
||||
|
||||
|
||||
@DROPOUT_LAYERS.register_module()
|
||||
class DropPath(nn.Module):
|
||||
"""Drop paths (Stochastic Depth) per sample (when applied in main path of
|
||||
residual blocks).
|
||||
|
||||
We follow the implementation
|
||||
https://github.com/rwightman/pytorch-image-models/blob/a2727c1bf78ba0d7b5727f5f95e37fb7f8866b1f/timm/models/layers/drop.py # noqa: E501
|
||||
|
||||
Args:
|
||||
drop_prob (float): Probability of the path to be zeroed. Default: 0.1
|
||||
"""
|
||||
|
||||
def __init__(self, drop_prob=0.1):
|
||||
super(DropPath, self).__init__()
|
||||
self.drop_prob = drop_prob
|
||||
|
||||
def forward(self, x):
|
||||
return drop_path(x, self.drop_prob, self.training)
|
||||
|
||||
|
||||
@DROPOUT_LAYERS.register_module()
|
||||
class Dropout(nn.Dropout):
|
||||
"""A wrapper for ``torch.nn.Dropout``, We rename the ``p`` of
|
||||
``torch.nn.Dropout`` to ``drop_prob`` so as to be consistent with
|
||||
``DropPath``
|
||||
|
||||
Args:
|
||||
drop_prob (float): Probability of the elements to be
|
||||
zeroed. Default: 0.5.
|
||||
inplace (bool): Do the operation inplace or not. Default: False.
|
||||
"""
|
||||
|
||||
def __init__(self, drop_prob=0.5, inplace=False):
|
||||
super().__init__(p=drop_prob, inplace=inplace)
|
||||
|
||||
|
||||
def build_dropout(cfg, default_args=None):
|
||||
"""Builder for drop out layers."""
|
||||
return build_from_cfg(cfg, DROPOUT_LAYERS, default_args)
|
||||
@@ -0,0 +1,412 @@
|
||||
# Copyright (c) OpenMMLab. All rights reserved.
|
||||
import math
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
from ..utils import kaiming_init
|
||||
from .registry import PLUGIN_LAYERS
|
||||
|
||||
|
||||
@PLUGIN_LAYERS.register_module()
|
||||
class GeneralizedAttention(nn.Module):
|
||||
"""GeneralizedAttention module.
|
||||
|
||||
See 'An Empirical Study of Spatial Attention Mechanisms in Deep Networks'
|
||||
(https://arxiv.org/abs/1711.07971) for details.
|
||||
|
||||
Args:
|
||||
in_channels (int): Channels of the input feature map.
|
||||
spatial_range (int): The spatial range. -1 indicates no spatial range
|
||||
constraint. Default: -1.
|
||||
num_heads (int): The head number of empirical_attention module.
|
||||
Default: 9.
|
||||
position_embedding_dim (int): The position embedding dimension.
|
||||
Default: -1.
|
||||
position_magnitude (int): A multiplier acting on coord difference.
|
||||
Default: 1.
|
||||
kv_stride (int): The feature stride acting on key/value feature map.
|
||||
Default: 2.
|
||||
q_stride (int): The feature stride acting on query feature map.
|
||||
Default: 1.
|
||||
attention_type (str): A binary indicator string for indicating which
|
||||
items in generalized empirical_attention module are used.
|
||||
Default: '1111'.
|
||||
|
||||
- '1000' indicates 'query and key content' (appr - appr) item,
|
||||
- '0100' indicates 'query content and relative position'
|
||||
(appr - position) item,
|
||||
- '0010' indicates 'key content only' (bias - appr) item,
|
||||
- '0001' indicates 'relative position only' (bias - position) item.
|
||||
"""
|
||||
|
||||
_abbr_ = 'gen_attention_block'
|
||||
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
spatial_range=-1,
|
||||
num_heads=9,
|
||||
position_embedding_dim=-1,
|
||||
position_magnitude=1,
|
||||
kv_stride=2,
|
||||
q_stride=1,
|
||||
attention_type='1111'):
|
||||
|
||||
super(GeneralizedAttention, self).__init__()
|
||||
|
||||
# hard range means local range for non-local operation
|
||||
self.position_embedding_dim = (
|
||||
position_embedding_dim
|
||||
if position_embedding_dim > 0 else in_channels)
|
||||
|
||||
self.position_magnitude = position_magnitude
|
||||
self.num_heads = num_heads
|
||||
self.in_channels = in_channels
|
||||
self.spatial_range = spatial_range
|
||||
self.kv_stride = kv_stride
|
||||
self.q_stride = q_stride
|
||||
self.attention_type = [bool(int(_)) for _ in attention_type]
|
||||
self.qk_embed_dim = in_channels // num_heads
|
||||
out_c = self.qk_embed_dim * num_heads
|
||||
|
||||
if self.attention_type[0] or self.attention_type[1]:
|
||||
self.query_conv = nn.Conv2d(
|
||||
in_channels=in_channels,
|
||||
out_channels=out_c,
|
||||
kernel_size=1,
|
||||
bias=False)
|
||||
self.query_conv.kaiming_init = True
|
||||
|
||||
if self.attention_type[0] or self.attention_type[2]:
|
||||
self.key_conv = nn.Conv2d(
|
||||
in_channels=in_channels,
|
||||
out_channels=out_c,
|
||||
kernel_size=1,
|
||||
bias=False)
|
||||
self.key_conv.kaiming_init = True
|
||||
|
||||
self.v_dim = in_channels // num_heads
|
||||
self.value_conv = nn.Conv2d(
|
||||
in_channels=in_channels,
|
||||
out_channels=self.v_dim * num_heads,
|
||||
kernel_size=1,
|
||||
bias=False)
|
||||
self.value_conv.kaiming_init = True
|
||||
|
||||
if self.attention_type[1] or self.attention_type[3]:
|
||||
self.appr_geom_fc_x = nn.Linear(
|
||||
self.position_embedding_dim // 2, out_c, bias=False)
|
||||
self.appr_geom_fc_x.kaiming_init = True
|
||||
|
||||
self.appr_geom_fc_y = nn.Linear(
|
||||
self.position_embedding_dim // 2, out_c, bias=False)
|
||||
self.appr_geom_fc_y.kaiming_init = True
|
||||
|
||||
if self.attention_type[2]:
|
||||
stdv = 1.0 / math.sqrt(self.qk_embed_dim * 2)
|
||||
appr_bias_value = -2 * stdv * torch.rand(out_c) + stdv
|
||||
self.appr_bias = nn.Parameter(appr_bias_value)
|
||||
|
||||
if self.attention_type[3]:
|
||||
stdv = 1.0 / math.sqrt(self.qk_embed_dim * 2)
|
||||
geom_bias_value = -2 * stdv * torch.rand(out_c) + stdv
|
||||
self.geom_bias = nn.Parameter(geom_bias_value)
|
||||
|
||||
self.proj_conv = nn.Conv2d(
|
||||
in_channels=self.v_dim * num_heads,
|
||||
out_channels=in_channels,
|
||||
kernel_size=1,
|
||||
bias=True)
|
||||
self.proj_conv.kaiming_init = True
|
||||
self.gamma = nn.Parameter(torch.zeros(1))
|
||||
|
||||
if self.spatial_range >= 0:
|
||||
# only works when non local is after 3*3 conv
|
||||
if in_channels == 256:
|
||||
max_len = 84
|
||||
elif in_channels == 512:
|
||||
max_len = 42
|
||||
|
||||
max_len_kv = int((max_len - 1.0) / self.kv_stride + 1)
|
||||
local_constraint_map = np.ones(
|
||||
(max_len, max_len, max_len_kv, max_len_kv), dtype=np.int)
|
||||
for iy in range(max_len):
|
||||
for ix in range(max_len):
|
||||
local_constraint_map[
|
||||
iy, ix,
|
||||
max((iy - self.spatial_range) //
|
||||
self.kv_stride, 0):min((iy + self.spatial_range +
|
||||
1) // self.kv_stride +
|
||||
1, max_len),
|
||||
max((ix - self.spatial_range) //
|
||||
self.kv_stride, 0):min((ix + self.spatial_range +
|
||||
1) // self.kv_stride +
|
||||
1, max_len)] = 0
|
||||
|
||||
self.local_constraint_map = nn.Parameter(
|
||||
torch.from_numpy(local_constraint_map).byte(),
|
||||
requires_grad=False)
|
||||
|
||||
if self.q_stride > 1:
|
||||
self.q_downsample = nn.AvgPool2d(
|
||||
kernel_size=1, stride=self.q_stride)
|
||||
else:
|
||||
self.q_downsample = None
|
||||
|
||||
if self.kv_stride > 1:
|
||||
self.kv_downsample = nn.AvgPool2d(
|
||||
kernel_size=1, stride=self.kv_stride)
|
||||
else:
|
||||
self.kv_downsample = None
|
||||
|
||||
self.init_weights()
|
||||
|
||||
def get_position_embedding(self,
|
||||
h,
|
||||
w,
|
||||
h_kv,
|
||||
w_kv,
|
||||
q_stride,
|
||||
kv_stride,
|
||||
device,
|
||||
dtype,
|
||||
feat_dim,
|
||||
wave_length=1000):
|
||||
# the default type of Tensor is float32, leading to type mismatch
|
||||
# in fp16 mode. Cast it to support fp16 mode.
|
||||
h_idxs = torch.linspace(0, h - 1, h).to(device=device, dtype=dtype)
|
||||
h_idxs = h_idxs.view((h, 1)) * q_stride
|
||||
|
||||
w_idxs = torch.linspace(0, w - 1, w).to(device=device, dtype=dtype)
|
||||
w_idxs = w_idxs.view((w, 1)) * q_stride
|
||||
|
||||
h_kv_idxs = torch.linspace(0, h_kv - 1, h_kv).to(
|
||||
device=device, dtype=dtype)
|
||||
h_kv_idxs = h_kv_idxs.view((h_kv, 1)) * kv_stride
|
||||
|
||||
w_kv_idxs = torch.linspace(0, w_kv - 1, w_kv).to(
|
||||
device=device, dtype=dtype)
|
||||
w_kv_idxs = w_kv_idxs.view((w_kv, 1)) * kv_stride
|
||||
|
||||
# (h, h_kv, 1)
|
||||
h_diff = h_idxs.unsqueeze(1) - h_kv_idxs.unsqueeze(0)
|
||||
h_diff *= self.position_magnitude
|
||||
|
||||
# (w, w_kv, 1)
|
||||
w_diff = w_idxs.unsqueeze(1) - w_kv_idxs.unsqueeze(0)
|
||||
w_diff *= self.position_magnitude
|
||||
|
||||
feat_range = torch.arange(0, feat_dim / 4).to(
|
||||
device=device, dtype=dtype)
|
||||
|
||||
dim_mat = torch.Tensor([wave_length]).to(device=device, dtype=dtype)
|
||||
dim_mat = dim_mat**((4. / feat_dim) * feat_range)
|
||||
dim_mat = dim_mat.view((1, 1, -1))
|
||||
|
||||
embedding_x = torch.cat(
|
||||
((w_diff / dim_mat).sin(), (w_diff / dim_mat).cos()), dim=2)
|
||||
|
||||
embedding_y = torch.cat(
|
||||
((h_diff / dim_mat).sin(), (h_diff / dim_mat).cos()), dim=2)
|
||||
|
||||
return embedding_x, embedding_y
|
||||
|
||||
def forward(self, x_input):
|
||||
num_heads = self.num_heads
|
||||
|
||||
# use empirical_attention
|
||||
if self.q_downsample is not None:
|
||||
x_q = self.q_downsample(x_input)
|
||||
else:
|
||||
x_q = x_input
|
||||
n, _, h, w = x_q.shape
|
||||
|
||||
if self.kv_downsample is not None:
|
||||
x_kv = self.kv_downsample(x_input)
|
||||
else:
|
||||
x_kv = x_input
|
||||
_, _, h_kv, w_kv = x_kv.shape
|
||||
|
||||
if self.attention_type[0] or self.attention_type[1]:
|
||||
proj_query = self.query_conv(x_q).view(
|
||||
(n, num_heads, self.qk_embed_dim, h * w))
|
||||
proj_query = proj_query.permute(0, 1, 3, 2)
|
||||
|
||||
if self.attention_type[0] or self.attention_type[2]:
|
||||
proj_key = self.key_conv(x_kv).view(
|
||||
(n, num_heads, self.qk_embed_dim, h_kv * w_kv))
|
||||
|
||||
if self.attention_type[1] or self.attention_type[3]:
|
||||
position_embed_x, position_embed_y = self.get_position_embedding(
|
||||
h, w, h_kv, w_kv, self.q_stride, self.kv_stride,
|
||||
x_input.device, x_input.dtype, self.position_embedding_dim)
|
||||
# (n, num_heads, w, w_kv, dim)
|
||||
position_feat_x = self.appr_geom_fc_x(position_embed_x).\
|
||||
view(1, w, w_kv, num_heads, self.qk_embed_dim).\
|
||||
permute(0, 3, 1, 2, 4).\
|
||||
repeat(n, 1, 1, 1, 1)
|
||||
|
||||
# (n, num_heads, h, h_kv, dim)
|
||||
position_feat_y = self.appr_geom_fc_y(position_embed_y).\
|
||||
view(1, h, h_kv, num_heads, self.qk_embed_dim).\
|
||||
permute(0, 3, 1, 2, 4).\
|
||||
repeat(n, 1, 1, 1, 1)
|
||||
|
||||
position_feat_x /= math.sqrt(2)
|
||||
position_feat_y /= math.sqrt(2)
|
||||
|
||||
# accelerate for saliency only
|
||||
if (np.sum(self.attention_type) == 1) and self.attention_type[2]:
|
||||
appr_bias = self.appr_bias.\
|
||||
view(1, num_heads, 1, self.qk_embed_dim).\
|
||||
repeat(n, 1, 1, 1)
|
||||
|
||||
energy = torch.matmul(appr_bias, proj_key).\
|
||||
view(n, num_heads, 1, h_kv * w_kv)
|
||||
|
||||
h = 1
|
||||
w = 1
|
||||
else:
|
||||
# (n, num_heads, h*w, h_kv*w_kv), query before key, 540mb for
|
||||
if not self.attention_type[0]:
|
||||
energy = torch.zeros(
|
||||
n,
|
||||
num_heads,
|
||||
h,
|
||||
w,
|
||||
h_kv,
|
||||
w_kv,
|
||||
dtype=x_input.dtype,
|
||||
device=x_input.device)
|
||||
|
||||
# attention_type[0]: appr - appr
|
||||
# attention_type[1]: appr - position
|
||||
# attention_type[2]: bias - appr
|
||||
# attention_type[3]: bias - position
|
||||
if self.attention_type[0] or self.attention_type[2]:
|
||||
if self.attention_type[0] and self.attention_type[2]:
|
||||
appr_bias = self.appr_bias.\
|
||||
view(1, num_heads, 1, self.qk_embed_dim)
|
||||
energy = torch.matmul(proj_query + appr_bias, proj_key).\
|
||||
view(n, num_heads, h, w, h_kv, w_kv)
|
||||
|
||||
elif self.attention_type[0]:
|
||||
energy = torch.matmul(proj_query, proj_key).\
|
||||
view(n, num_heads, h, w, h_kv, w_kv)
|
||||
|
||||
elif self.attention_type[2]:
|
||||
appr_bias = self.appr_bias.\
|
||||
view(1, num_heads, 1, self.qk_embed_dim).\
|
||||
repeat(n, 1, 1, 1)
|
||||
|
||||
energy += torch.matmul(appr_bias, proj_key).\
|
||||
view(n, num_heads, 1, 1, h_kv, w_kv)
|
||||
|
||||
if self.attention_type[1] or self.attention_type[3]:
|
||||
if self.attention_type[1] and self.attention_type[3]:
|
||||
geom_bias = self.geom_bias.\
|
||||
view(1, num_heads, 1, self.qk_embed_dim)
|
||||
|
||||
proj_query_reshape = (proj_query + geom_bias).\
|
||||
view(n, num_heads, h, w, self.qk_embed_dim)
|
||||
|
||||
energy_x = torch.matmul(
|
||||
proj_query_reshape.permute(0, 1, 3, 2, 4),
|
||||
position_feat_x.permute(0, 1, 2, 4, 3))
|
||||
energy_x = energy_x.\
|
||||
permute(0, 1, 3, 2, 4).unsqueeze(4)
|
||||
|
||||
energy_y = torch.matmul(
|
||||
proj_query_reshape,
|
||||
position_feat_y.permute(0, 1, 2, 4, 3))
|
||||
energy_y = energy_y.unsqueeze(5)
|
||||
|
||||
energy += energy_x + energy_y
|
||||
|
||||
elif self.attention_type[1]:
|
||||
proj_query_reshape = proj_query.\
|
||||
view(n, num_heads, h, w, self.qk_embed_dim)
|
||||
proj_query_reshape = proj_query_reshape.\
|
||||
permute(0, 1, 3, 2, 4)
|
||||
position_feat_x_reshape = position_feat_x.\
|
||||
permute(0, 1, 2, 4, 3)
|
||||
position_feat_y_reshape = position_feat_y.\
|
||||
permute(0, 1, 2, 4, 3)
|
||||
|
||||
energy_x = torch.matmul(proj_query_reshape,
|
||||
position_feat_x_reshape)
|
||||
energy_x = energy_x.permute(0, 1, 3, 2, 4).unsqueeze(4)
|
||||
|
||||
energy_y = torch.matmul(proj_query_reshape,
|
||||
position_feat_y_reshape)
|
||||
energy_y = energy_y.unsqueeze(5)
|
||||
|
||||
energy += energy_x + energy_y
|
||||
|
||||
elif self.attention_type[3]:
|
||||
geom_bias = self.geom_bias.\
|
||||
view(1, num_heads, self.qk_embed_dim, 1).\
|
||||
repeat(n, 1, 1, 1)
|
||||
|
||||
position_feat_x_reshape = position_feat_x.\
|
||||
view(n, num_heads, w*w_kv, self.qk_embed_dim)
|
||||
|
||||
position_feat_y_reshape = position_feat_y.\
|
||||
view(n, num_heads, h * h_kv, self.qk_embed_dim)
|
||||
|
||||
energy_x = torch.matmul(position_feat_x_reshape, geom_bias)
|
||||
energy_x = energy_x.view(n, num_heads, 1, w, 1, w_kv)
|
||||
|
||||
energy_y = torch.matmul(position_feat_y_reshape, geom_bias)
|
||||
energy_y = energy_y.view(n, num_heads, h, 1, h_kv, 1)
|
||||
|
||||
energy += energy_x + energy_y
|
||||
|
||||
energy = energy.view(n, num_heads, h * w, h_kv * w_kv)
|
||||
|
||||
if self.spatial_range >= 0:
|
||||
cur_local_constraint_map = \
|
||||
self.local_constraint_map[:h, :w, :h_kv, :w_kv].\
|
||||
contiguous().\
|
||||
view(1, 1, h*w, h_kv*w_kv)
|
||||
|
||||
energy = energy.masked_fill_(cur_local_constraint_map,
|
||||
float('-inf'))
|
||||
|
||||
attention = F.softmax(energy, 3)
|
||||
|
||||
proj_value = self.value_conv(x_kv)
|
||||
proj_value_reshape = proj_value.\
|
||||
view((n, num_heads, self.v_dim, h_kv * w_kv)).\
|
||||
permute(0, 1, 3, 2)
|
||||
|
||||
out = torch.matmul(attention, proj_value_reshape).\
|
||||
permute(0, 1, 3, 2).\
|
||||
contiguous().\
|
||||
view(n, self.v_dim * self.num_heads, h, w)
|
||||
|
||||
out = self.proj_conv(out)
|
||||
|
||||
# output is downsampled, upsample back to input size
|
||||
if self.q_downsample is not None:
|
||||
out = F.interpolate(
|
||||
out,
|
||||
size=x_input.shape[2:],
|
||||
mode='bilinear',
|
||||
align_corners=False)
|
||||
|
||||
out = self.gamma * out + x_input
|
||||
return out
|
||||
|
||||
def init_weights(self):
|
||||
for m in self.modules():
|
||||
if hasattr(m, 'kaiming_init') and m.kaiming_init:
|
||||
kaiming_init(
|
||||
m,
|
||||
mode='fan_in',
|
||||
nonlinearity='leaky_relu',
|
||||
bias=0,
|
||||
distribution='uniform',
|
||||
a=1)
|
||||
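For reference, a minimal standalone sketch of the sinusoidal relative position embedding assembled in get_position_embedding above. The helper name and the feat_dim/wave_length defaults are illustrative assumptions, not part of this file.

import torch

def relative_position_embedding(length, kv_length, feat_dim=8, wave_length=1000):
    # hypothetical helper mirroring the sin/cos construction used above
    idxs = torch.linspace(0, length - 1, length).view(length, 1)
    kv_idxs = torch.linspace(0, kv_length - 1, kv_length).view(kv_length, 1)
    diff = idxs.unsqueeze(1) - kv_idxs.unsqueeze(0)        # (length, kv_length, 1)
    feat_range = torch.arange(0, feat_dim / 4)
    dim_mat = torch.tensor([float(wave_length)]) ** ((4. / feat_dim) * feat_range)
    dim_mat = dim_mat.view(1, 1, -1)
    # concatenating sin and cos doubles the last dim to feat_dim / 2
    return torch.cat(((diff / dim_mat).sin(), (diff / dim_mat).cos()), dim=2)

emb = relative_position_embedding(4, 4)   # -> shape (4, 4, 4) when feat_dim=8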
@@ -0,0 +1,34 @@
# Copyright (c) OpenMMLab. All rights reserved.
import torch.nn as nn

from .registry import ACTIVATION_LAYERS


@ACTIVATION_LAYERS.register_module()
class HSigmoid(nn.Module):
    """Hard Sigmoid Module. Apply the hard sigmoid function:
    Hsigmoid(x) = min(max((x + bias) / divisor, min_value), max_value)
    Default: Hsigmoid(x) = min(max((x + 1) / 2, 0), 1)

    Args:
        bias (float): Bias of the input feature map. Default: 1.0.
        divisor (float): Divisor of the input feature map. Default: 2.0.
        min_value (float): Lower bound value. Default: 0.0.
        max_value (float): Upper bound value. Default: 1.0.

    Returns:
        Tensor: The output tensor.
    """

    def __init__(self, bias=1.0, divisor=2.0, min_value=0.0, max_value=1.0):
        super(HSigmoid, self).__init__()
        self.bias = bias
        self.divisor = divisor
        assert self.divisor != 0
        self.min_value = min_value
        self.max_value = max_value

    def forward(self, x):
        x = (x + self.bias) / self.divisor

        return x.clamp_(self.min_value, self.max_value)
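A brief usage sketch for the module above; the tensor values are arbitrary and only illustrative.

import torch

act = HSigmoid()                      # default: min(max((x + 1) / 2, 0), 1)
x = torch.tensor([-3.0, -1.0, 0.0, 1.0, 3.0])
print(act(x))                         # tensor([0.0000, 0.0000, 0.5000, 1.0000, 1.0000])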
@@ -0,0 +1,29 @@
# Copyright (c) OpenMMLab. All rights reserved.
import torch.nn as nn

from .registry import ACTIVATION_LAYERS


@ACTIVATION_LAYERS.register_module()
class HSwish(nn.Module):
    """Hard Swish Module.

    This module applies the hard swish function:

    .. math::
        Hswish(x) = x * ReLU6(x + 3) / 6

    Args:
        inplace (bool): can optionally do the operation in-place.
            Default: False.

    Returns:
        Tensor: The output tensor.
    """

    def __init__(self, inplace=False):
        super(HSwish, self).__init__()
        self.act = nn.ReLU6(inplace)

    def forward(self, x):
        return x * self.act(x + 3) / 6
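A brief usage sketch for HSwish; the input values are arbitrary.

import torch

act = HSwish()
x = torch.tensor([-4.0, -1.0, 0.0, 1.0, 4.0])
print(act(x))   # x * relu6(x + 3) / 6, approximately [-0.0, -0.333, 0.0, 0.667, 4.0]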
@@ -0,0 +1,306 @@
|
||||
# Copyright (c) OpenMMLab. All rights reserved.
|
||||
from abc import ABCMeta
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from ..utils import constant_init, normal_init
|
||||
from .conv_module import ConvModule
|
||||
from .registry import PLUGIN_LAYERS
|
||||
|
||||
|
||||
class _NonLocalNd(nn.Module, metaclass=ABCMeta):
|
||||
"""Basic Non-local module.
|
||||
|
||||
This module is proposed in
|
||||
"Non-local Neural Networks"
|
||||
Paper reference: https://arxiv.org/abs/1711.07971
|
||||
Code reference: https://github.com/AlexHex7/Non-local_pytorch
|
||||
|
||||
Args:
|
||||
in_channels (int): Channels of the input feature map.
|
||||
reduction (int): Channel reduction ratio. Default: 2.
|
||||
use_scale (bool): Whether to scale pairwise_weight by
|
||||
`1/sqrt(inter_channels)` when the mode is `embedded_gaussian`.
|
||||
Default: True.
|
||||
conv_cfg (None | dict): The config dict for convolution layers.
|
||||
If not specified, it will use `nn.Conv2d` for convolution layers.
|
||||
Default: None.
|
||||
norm_cfg (None | dict): The config dict for normalization layers.
|
||||
Default: None. (This parameter is only applicable to conv_out.)
|
||||
mode (str): Options are `gaussian`, `concatenation`,
|
||||
`embedded_gaussian` and `dot_product`. Default: embedded_gaussian.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
reduction=2,
|
||||
use_scale=True,
|
||||
conv_cfg=None,
|
||||
norm_cfg=None,
|
||||
mode='embedded_gaussian',
|
||||
**kwargs):
|
||||
super(_NonLocalNd, self).__init__()
|
||||
self.in_channels = in_channels
|
||||
self.reduction = reduction
|
||||
self.use_scale = use_scale
|
||||
self.inter_channels = max(in_channels // reduction, 1)
|
||||
self.mode = mode
|
||||
|
||||
if mode not in [
|
||||
'gaussian', 'embedded_gaussian', 'dot_product', 'concatenation'
|
||||
]:
|
||||
raise ValueError("Mode should be in 'gaussian', 'concatenation', "
|
||||
f"'embedded_gaussian' or 'dot_product', but got "
|
||||
f'{mode} instead.')
|
||||
|
||||
# g, theta, phi are defaulted as `nn.ConvNd`.
|
||||
# Here we use ConvModule for potential usage.
|
||||
self.g = ConvModule(
|
||||
self.in_channels,
|
||||
self.inter_channels,
|
||||
kernel_size=1,
|
||||
conv_cfg=conv_cfg,
|
||||
act_cfg=None)
|
||||
self.conv_out = ConvModule(
|
||||
self.inter_channels,
|
||||
self.in_channels,
|
||||
kernel_size=1,
|
||||
conv_cfg=conv_cfg,
|
||||
norm_cfg=norm_cfg,
|
||||
act_cfg=None)
|
||||
|
||||
if self.mode != 'gaussian':
|
||||
self.theta = ConvModule(
|
||||
self.in_channels,
|
||||
self.inter_channels,
|
||||
kernel_size=1,
|
||||
conv_cfg=conv_cfg,
|
||||
act_cfg=None)
|
||||
self.phi = ConvModule(
|
||||
self.in_channels,
|
||||
self.inter_channels,
|
||||
kernel_size=1,
|
||||
conv_cfg=conv_cfg,
|
||||
act_cfg=None)
|
||||
|
||||
if self.mode == 'concatenation':
|
||||
self.concat_project = ConvModule(
|
||||
self.inter_channels * 2,
|
||||
1,
|
||||
kernel_size=1,
|
||||
stride=1,
|
||||
padding=0,
|
||||
bias=False,
|
||||
act_cfg=dict(type='ReLU'))
|
||||
|
||||
self.init_weights(**kwargs)
|
||||
|
||||
def init_weights(self, std=0.01, zeros_init=True):
|
||||
if self.mode != 'gaussian':
|
||||
for m in [self.g, self.theta, self.phi]:
|
||||
normal_init(m.conv, std=std)
|
||||
else:
|
||||
normal_init(self.g.conv, std=std)
|
||||
if zeros_init:
|
||||
if self.conv_out.norm_cfg is None:
|
||||
constant_init(self.conv_out.conv, 0)
|
||||
else:
|
||||
constant_init(self.conv_out.norm, 0)
|
||||
else:
|
||||
if self.conv_out.norm_cfg is None:
|
||||
normal_init(self.conv_out.conv, std=std)
|
||||
else:
|
||||
normal_init(self.conv_out.norm, std=std)
|
||||
|
||||
def gaussian(self, theta_x, phi_x):
|
||||
# NonLocal1d pairwise_weight: [N, H, H]
|
||||
# NonLocal2d pairwise_weight: [N, HxW, HxW]
|
||||
# NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]
|
||||
pairwise_weight = torch.matmul(theta_x, phi_x)
|
||||
pairwise_weight = pairwise_weight.softmax(dim=-1)
|
||||
return pairwise_weight
|
||||
|
||||
def embedded_gaussian(self, theta_x, phi_x):
|
||||
# NonLocal1d pairwise_weight: [N, H, H]
|
||||
# NonLocal2d pairwise_weight: [N, HxW, HxW]
|
||||
# NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]
|
||||
pairwise_weight = torch.matmul(theta_x, phi_x)
|
||||
if self.use_scale:
|
||||
# theta_x.shape[-1] is `self.inter_channels`
|
||||
pairwise_weight /= theta_x.shape[-1]**0.5
|
||||
pairwise_weight = pairwise_weight.softmax(dim=-1)
|
||||
return pairwise_weight
|
||||
|
||||
def dot_product(self, theta_x, phi_x):
|
||||
# NonLocal1d pairwise_weight: [N, H, H]
|
||||
# NonLocal2d pairwise_weight: [N, HxW, HxW]
|
||||
# NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]
|
||||
pairwise_weight = torch.matmul(theta_x, phi_x)
|
||||
pairwise_weight /= pairwise_weight.shape[-1]
|
||||
return pairwise_weight
|
||||
|
||||
def concatenation(self, theta_x, phi_x):
|
||||
# NonLocal1d pairwise_weight: [N, H, H]
|
||||
# NonLocal2d pairwise_weight: [N, HxW, HxW]
|
||||
# NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]
|
||||
h = theta_x.size(2)
|
||||
w = phi_x.size(3)
|
||||
theta_x = theta_x.repeat(1, 1, 1, w)
|
||||
phi_x = phi_x.repeat(1, 1, h, 1)
|
||||
|
||||
concat_feature = torch.cat([theta_x, phi_x], dim=1)
|
||||
pairwise_weight = self.concat_project(concat_feature)
|
||||
n, _, h, w = pairwise_weight.size()
|
||||
pairwise_weight = pairwise_weight.view(n, h, w)
|
||||
pairwise_weight /= pairwise_weight.shape[-1]
|
||||
|
||||
return pairwise_weight
|
||||
|
||||
def forward(self, x):
|
||||
# Assume `reduction = 1`, then `inter_channels = C`
|
||||
# or `inter_channels = C` when `mode="gaussian"`
|
||||
|
||||
# NonLocal1d x: [N, C, H]
|
||||
# NonLocal2d x: [N, C, H, W]
|
||||
# NonLocal3d x: [N, C, T, H, W]
|
||||
n = x.size(0)
|
||||
|
||||
# NonLocal1d g_x: [N, H, C]
|
||||
# NonLocal2d g_x: [N, HxW, C]
|
||||
# NonLocal3d g_x: [N, TxHxW, C]
|
||||
g_x = self.g(x).view(n, self.inter_channels, -1)
|
||||
g_x = g_x.permute(0, 2, 1)
|
||||
|
||||
# NonLocal1d theta_x: [N, H, C], phi_x: [N, C, H]
|
||||
# NonLocal2d theta_x: [N, HxW, C], phi_x: [N, C, HxW]
|
||||
# NonLocal3d theta_x: [N, TxHxW, C], phi_x: [N, C, TxHxW]
|
||||
if self.mode == 'gaussian':
|
||||
theta_x = x.view(n, self.in_channels, -1)
|
||||
theta_x = theta_x.permute(0, 2, 1)
|
||||
if self.sub_sample:
|
||||
phi_x = self.phi(x).view(n, self.in_channels, -1)
|
||||
else:
|
||||
phi_x = x.view(n, self.in_channels, -1)
|
||||
elif self.mode == 'concatenation':
|
||||
theta_x = self.theta(x).view(n, self.inter_channels, -1, 1)
|
||||
phi_x = self.phi(x).view(n, self.inter_channels, 1, -1)
|
||||
else:
|
||||
theta_x = self.theta(x).view(n, self.inter_channels, -1)
|
||||
theta_x = theta_x.permute(0, 2, 1)
|
||||
phi_x = self.phi(x).view(n, self.inter_channels, -1)
|
||||
|
||||
pairwise_func = getattr(self, self.mode)
|
||||
# NonLocal1d pairwise_weight: [N, H, H]
|
||||
# NonLocal2d pairwise_weight: [N, HxW, HxW]
|
||||
# NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]
|
||||
pairwise_weight = pairwise_func(theta_x, phi_x)
|
||||
|
||||
# NonLocal1d y: [N, H, C]
|
||||
# NonLocal2d y: [N, HxW, C]
|
||||
# NonLocal3d y: [N, TxHxW, C]
|
||||
y = torch.matmul(pairwise_weight, g_x)
|
||||
# NonLocal1d y: [N, C, H]
|
||||
# NonLocal2d y: [N, C, H, W]
|
||||
# NonLocal3d y: [N, C, T, H, W]
|
||||
y = y.permute(0, 2, 1).contiguous().reshape(n, self.inter_channels,
|
||||
*x.size()[2:])
|
||||
|
||||
output = x + self.conv_out(y)
|
||||
|
||||
return output
|
||||
|
||||
|
||||
class NonLocal1d(_NonLocalNd):
|
||||
"""1D Non-local module.
|
||||
|
||||
Args:
|
||||
in_channels (int): Same as `NonLocalND`.
|
||||
sub_sample (bool): Whether to apply max pooling after pairwise
|
||||
function (Note that the `sub_sample` is applied on spatial only).
|
||||
Default: False.
|
||||
conv_cfg (None | dict): Same as `NonLocalND`.
|
||||
Default: dict(type='Conv1d').
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
sub_sample=False,
|
||||
conv_cfg=dict(type='Conv1d'),
|
||||
**kwargs):
|
||||
super(NonLocal1d, self).__init__(
|
||||
in_channels, conv_cfg=conv_cfg, **kwargs)
|
||||
|
||||
self.sub_sample = sub_sample
|
||||
|
||||
if sub_sample:
|
||||
max_pool_layer = nn.MaxPool1d(kernel_size=2)
|
||||
self.g = nn.Sequential(self.g, max_pool_layer)
|
||||
if self.mode != 'gaussian':
|
||||
self.phi = nn.Sequential(self.phi, max_pool_layer)
|
||||
else:
|
||||
self.phi = max_pool_layer
|
||||
|
||||
|
||||
@PLUGIN_LAYERS.register_module()
|
||||
class NonLocal2d(_NonLocalNd):
|
||||
"""2D Non-local module.
|
||||
|
||||
Args:
|
||||
in_channels (int): Same as `NonLocalND`.
|
||||
sub_sample (bool): Whether to apply max pooling after pairwise
|
||||
function (Note that the `sub_sample` is applied on spatial only).
|
||||
Default: False.
|
||||
conv_cfg (None | dict): Same as `NonLocalND`.
|
||||
Default: dict(type='Conv2d').
|
||||
"""
|
||||
|
||||
_abbr_ = 'nonlocal_block'
|
||||
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
sub_sample=False,
|
||||
conv_cfg=dict(type='Conv2d'),
|
||||
**kwargs):
|
||||
super(NonLocal2d, self).__init__(
|
||||
in_channels, conv_cfg=conv_cfg, **kwargs)
|
||||
|
||||
self.sub_sample = sub_sample
|
||||
|
||||
if sub_sample:
|
||||
max_pool_layer = nn.MaxPool2d(kernel_size=(2, 2))
|
||||
self.g = nn.Sequential(self.g, max_pool_layer)
|
||||
if self.mode != 'gaussian':
|
||||
self.phi = nn.Sequential(self.phi, max_pool_layer)
|
||||
else:
|
||||
self.phi = max_pool_layer
|
||||
|
||||
|
||||
class NonLocal3d(_NonLocalNd):
|
||||
"""3D Non-local module.
|
||||
|
||||
Args:
|
||||
in_channels (int): Same as `NonLocalND`.
|
||||
sub_sample (bool): Whether to apply max pooling after pairwise
|
||||
function (Note that the `sub_sample` is applied on spatial only).
|
||||
Default: False.
|
||||
conv_cfg (None | dict): Same as `NonLocalND`.
|
||||
Default: dict(type='Conv3d').
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
sub_sample=False,
|
||||
conv_cfg=dict(type='Conv3d'),
|
||||
**kwargs):
|
||||
super(NonLocal3d, self).__init__(
|
||||
in_channels, conv_cfg=conv_cfg, **kwargs)
|
||||
self.sub_sample = sub_sample
|
||||
|
||||
if sub_sample:
|
||||
max_pool_layer = nn.MaxPool3d(kernel_size=(1, 2, 2))
|
||||
self.g = nn.Sequential(self.g, max_pool_layer)
|
||||
if self.mode != 'gaussian':
|
||||
self.phi = nn.Sequential(self.phi, max_pool_layer)
|
||||
else:
|
||||
self.phi = max_pool_layer
|
||||
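A short usage sketch for the registered 2D variant above; the channel count and spatial size are arbitrary and only illustrative.

import torch

block = NonLocal2d(in_channels=16, reduction=2, mode='embedded_gaussian')
x = torch.rand(2, 16, 20, 20)
out = block(x)                 # residual output, same shape as the input
assert out.shape == x.shape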
@@ -0,0 +1,144 @@
|
||||
# Copyright (c) OpenMMLab. All rights reserved.
|
||||
import inspect
|
||||
|
||||
import torch.nn as nn
|
||||
|
||||
from annotator.mmpkg.mmcv.utils import is_tuple_of
|
||||
from annotator.mmpkg.mmcv.utils.parrots_wrapper import SyncBatchNorm, _BatchNorm, _InstanceNorm
|
||||
from .registry import NORM_LAYERS
|
||||
|
||||
NORM_LAYERS.register_module('BN', module=nn.BatchNorm2d)
|
||||
NORM_LAYERS.register_module('BN1d', module=nn.BatchNorm1d)
|
||||
NORM_LAYERS.register_module('BN2d', module=nn.BatchNorm2d)
|
||||
NORM_LAYERS.register_module('BN3d', module=nn.BatchNorm3d)
|
||||
NORM_LAYERS.register_module('SyncBN', module=SyncBatchNorm)
|
||||
NORM_LAYERS.register_module('GN', module=nn.GroupNorm)
|
||||
NORM_LAYERS.register_module('LN', module=nn.LayerNorm)
|
||||
NORM_LAYERS.register_module('IN', module=nn.InstanceNorm2d)
|
||||
NORM_LAYERS.register_module('IN1d', module=nn.InstanceNorm1d)
|
||||
NORM_LAYERS.register_module('IN2d', module=nn.InstanceNorm2d)
|
||||
NORM_LAYERS.register_module('IN3d', module=nn.InstanceNorm3d)
|
||||
|
||||
|
||||
def infer_abbr(class_type):
|
||||
"""Infer abbreviation from the class name.
|
||||
|
||||
When we build a norm layer with `build_norm_layer()`, we want to preserve
|
||||
the norm type in variable names, e.g, self.bn1, self.gn. This method will
|
||||
infer the abbreviation to map class types to abbreviations.
|
||||
|
||||
Rule 1: If the class has the property "_abbr_", return the property.
|
||||
Rule 2: If the parent class is _BatchNorm, GroupNorm, LayerNorm or
|
||||
InstanceNorm, the abbreviation of this layer will be "bn", "gn", "ln" and
|
||||
"in" respectively.
|
||||
Rule 3: If the class name contains "batch", "group", "layer" or "instance",
|
||||
the abbreviation of this layer will be "bn", "gn", "ln" and "in"
|
||||
respectively.
|
||||
Rule 4: Otherwise, the abbreviation falls back to "norm".
|
||||
|
||||
Args:
|
||||
class_type (type): The norm layer type.
|
||||
|
||||
Returns:
|
||||
str: The inferred abbreviation.
|
||||
"""
|
||||
if not inspect.isclass(class_type):
|
||||
raise TypeError(
|
||||
f'class_type must be a type, but got {type(class_type)}')
|
||||
if hasattr(class_type, '_abbr_'):
|
||||
return class_type._abbr_
|
||||
if issubclass(class_type, _InstanceNorm): # IN is a subclass of BN
|
||||
return 'in'
|
||||
elif issubclass(class_type, _BatchNorm):
|
||||
return 'bn'
|
||||
elif issubclass(class_type, nn.GroupNorm):
|
||||
return 'gn'
|
||||
elif issubclass(class_type, nn.LayerNorm):
|
||||
return 'ln'
|
||||
else:
|
||||
class_name = class_type.__name__.lower()
|
||||
if 'batch' in class_name:
|
||||
return 'bn'
|
||||
elif 'group' in class_name:
|
||||
return 'gn'
|
||||
elif 'layer' in class_name:
|
||||
return 'ln'
|
||||
elif 'instance' in class_name:
|
||||
return 'in'
|
||||
else:
|
||||
return 'norm_layer'
|
||||
|
||||
|
||||
def build_norm_layer(cfg, num_features, postfix=''):
|
||||
"""Build normalization layer.
|
||||
|
||||
Args:
|
||||
cfg (dict): The norm layer config, which should contain:
|
||||
|
||||
- type (str): Layer type.
|
||||
- layer args: Args needed to instantiate a norm layer.
|
||||
- requires_grad (bool, optional): Whether stop gradient updates.
|
||||
num_features (int): Number of input channels.
|
||||
postfix (int | str): The postfix to be appended into norm abbreviation
|
||||
to create named layer.
|
||||
|
||||
Returns:
|
||||
(str, nn.Module): The first element is the layer name consisting of
|
||||
abbreviation and postfix, e.g., bn1, gn. The second element is the
|
||||
created norm layer.
|
||||
"""
|
||||
if not isinstance(cfg, dict):
|
||||
raise TypeError('cfg must be a dict')
|
||||
if 'type' not in cfg:
|
||||
raise KeyError('the cfg dict must contain the key "type"')
|
||||
cfg_ = cfg.copy()
|
||||
|
||||
layer_type = cfg_.pop('type')
|
||||
if layer_type not in NORM_LAYERS:
|
||||
raise KeyError(f'Unrecognized norm type {layer_type}')
|
||||
|
||||
norm_layer = NORM_LAYERS.get(layer_type)
|
||||
abbr = infer_abbr(norm_layer)
|
||||
|
||||
assert isinstance(postfix, (int, str))
|
||||
name = abbr + str(postfix)
|
||||
|
||||
requires_grad = cfg_.pop('requires_grad', True)
|
||||
cfg_.setdefault('eps', 1e-5)
|
||||
if layer_type != 'GN':
|
||||
layer = norm_layer(num_features, **cfg_)
|
||||
if layer_type == 'SyncBN' and hasattr(layer, '_specify_ddp_gpu_num'):
|
||||
layer._specify_ddp_gpu_num(1)
|
||||
else:
|
||||
assert 'num_groups' in cfg_
|
||||
layer = norm_layer(num_channels=num_features, **cfg_)
|
||||
|
||||
for param in layer.parameters():
|
||||
param.requires_grad = requires_grad
|
||||
|
||||
return name, layer
|
||||
|
||||
|
||||
def is_norm(layer, exclude=None):
|
||||
"""Check if a layer is a normalization layer.
|
||||
|
||||
Args:
|
||||
layer (nn.Module): The layer to be checked.
|
||||
exclude (type | tuple[type]): Types to be excluded.
|
||||
|
||||
Returns:
|
||||
bool: Whether the layer is a norm layer.
|
||||
"""
|
||||
if exclude is not None:
|
||||
if not isinstance(exclude, tuple):
|
||||
exclude = (exclude, )
|
||||
if not is_tuple_of(exclude, type):
|
||||
raise TypeError(
|
||||
f'"exclude" must be either None or type or a tuple of types, '
|
||||
f'but got {type(exclude)}: {exclude}')
|
||||
|
||||
if exclude and isinstance(layer, exclude):
|
||||
return False
|
||||
|
||||
all_norm_bases = (_BatchNorm, _InstanceNorm, nn.GroupNorm, nn.LayerNorm)
|
||||
return isinstance(layer, all_norm_bases)
|
||||
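A usage sketch for build_norm_layer; the channel counts are arbitrary.

name, layer = build_norm_layer(dict(type='BN'), num_features=64)
# name == 'bn'; layer is nn.BatchNorm2d(64, eps=1e-05) with requires_grad=True

name, layer = build_norm_layer(dict(type='GN', num_groups=8), num_features=64, postfix=1)
# name == 'gn1'; layer is nn.GroupNorm(num_groups=8, num_channels=64)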
@@ -0,0 +1,36 @@
# Copyright (c) OpenMMLab. All rights reserved.
import torch.nn as nn

from .registry import PADDING_LAYERS

PADDING_LAYERS.register_module('zero', module=nn.ZeroPad2d)
PADDING_LAYERS.register_module('reflect', module=nn.ReflectionPad2d)
PADDING_LAYERS.register_module('replicate', module=nn.ReplicationPad2d)


def build_padding_layer(cfg, *args, **kwargs):
    """Build padding layer.

    Args:
        cfg (None or dict): The padding layer config, which should contain:
            - type (str): Layer type.
            - layer args: Args needed to instantiate a padding layer.

    Returns:
        nn.Module: Created padding layer.
    """
    if not isinstance(cfg, dict):
        raise TypeError('cfg must be a dict')
    if 'type' not in cfg:
        raise KeyError('the cfg dict must contain the key "type"')

    cfg_ = cfg.copy()
    padding_type = cfg_.pop('type')
    if padding_type not in PADDING_LAYERS:
        raise KeyError(f'Unrecognized padding type {padding_type}.')
    else:
        padding_layer = PADDING_LAYERS.get(padding_type)

    layer = padding_layer(*args, **kwargs, **cfg_)

    return layer
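A usage sketch; 'reflect' resolves to nn.ReflectionPad2d through the registry above, and the sizes below are arbitrary.

import torch

pad = build_padding_layer(dict(type='reflect'), 2)   # equivalent to nn.ReflectionPad2d(2)
x = torch.rand(1, 3, 8, 8)
print(pad(x).shape)                                  # torch.Size([1, 3, 12, 12])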
@@ -0,0 +1,88 @@
|
||||
import inspect
|
||||
import platform
|
||||
|
||||
from .registry import PLUGIN_LAYERS
|
||||
|
||||
if platform.system() == 'Windows':
|
||||
import regex as re
|
||||
else:
|
||||
import re
|
||||
|
||||
|
||||
def infer_abbr(class_type):
|
||||
"""Infer abbreviation from the class name.
|
||||
|
||||
This method will infer the abbreviation to map class types to
|
||||
abbreviations.
|
||||
|
||||
Rule 1: If the class has the property "abbr", return the property.
|
||||
Rule 2: Otherwise, the abbreviation falls back to snake case of class
|
||||
name, e.g. the abbreviation of ``FancyBlock`` will be ``fancy_block``.
|
||||
|
||||
Args:
|
||||
class_type (type): The norm layer type.
|
||||
|
||||
Returns:
|
||||
str: The inferred abbreviation.
|
||||
"""
|
||||
|
||||
def camel2snack(word):
|
||||
"""Convert camel case word into snack case.
|
||||
|
||||
Modified from `inflection lib
|
||||
<https://inflection.readthedocs.io/en/latest/#inflection.underscore>`_.
|
||||
|
||||
Example::
|
||||
|
||||
>>> camel2snack("FancyBlock")
|
||||
'fancy_block'
|
||||
"""
|
||||
|
||||
word = re.sub(r'([A-Z]+)([A-Z][a-z])', r'\1_\2', word)
|
||||
word = re.sub(r'([a-z\d])([A-Z])', r'\1_\2', word)
|
||||
word = word.replace('-', '_')
|
||||
return word.lower()
|
||||
|
||||
if not inspect.isclass(class_type):
|
||||
raise TypeError(
|
||||
f'class_type must be a type, but got {type(class_type)}')
|
||||
if hasattr(class_type, '_abbr_'):
|
||||
return class_type._abbr_
|
||||
else:
|
||||
return camel2snack(class_type.__name__)
|
||||
|
||||
|
||||
def build_plugin_layer(cfg, postfix='', **kwargs):
|
||||
"""Build plugin layer.
|
||||
|
||||
Args:
|
||||
cfg (None or dict): cfg should contain:
|
||||
type (str): identify plugin layer type.
|
||||
layer args: args needed to instantiate a plugin layer.
|
||||
postfix (int, str): appended into norm abbreviation to
|
||||
create named layer. Default: ''.
|
||||
|
||||
Returns:
|
||||
tuple[str, nn.Module]:
|
||||
name (str): abbreviation + postfix
|
||||
layer (nn.Module): created plugin layer
|
||||
"""
|
||||
if not isinstance(cfg, dict):
|
||||
raise TypeError('cfg must be a dict')
|
||||
if 'type' not in cfg:
|
||||
raise KeyError('the cfg dict must contain the key "type"')
|
||||
cfg_ = cfg.copy()
|
||||
|
||||
layer_type = cfg_.pop('type')
|
||||
if layer_type not in PLUGIN_LAYERS:
|
||||
raise KeyError(f'Unrecognized plugin type {layer_type}')
|
||||
|
||||
plugin_layer = PLUGIN_LAYERS.get(layer_type)
|
||||
abbr = infer_abbr(plugin_layer)
|
||||
|
||||
assert isinstance(postfix, (int, str))
|
||||
name = abbr + str(postfix)
|
||||
|
||||
layer = plugin_layer(**kwargs, **cfg_)
|
||||
|
||||
return name, layer
|
||||
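A usage sketch for build_plugin_layer. NonLocal2d (defined earlier in this package) is registered in PLUGIN_LAYERS and declares _abbr_ = 'nonlocal_block', which becomes the returned name; the channel count is arbitrary.

name, layer = build_plugin_layer(
    dict(type='NonLocal2d', in_channels=16, reduction=2), postfix='_1')
# name == 'nonlocal_block_1'; layer is a NonLocal2d instance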
@@ -0,0 +1,16 @@
# Copyright (c) OpenMMLab. All rights reserved.
from annotator.mmpkg.mmcv.utils import Registry

CONV_LAYERS = Registry('conv layer')
NORM_LAYERS = Registry('norm layer')
ACTIVATION_LAYERS = Registry('activation layer')
PADDING_LAYERS = Registry('padding layer')
UPSAMPLE_LAYERS = Registry('upsample layer')
PLUGIN_LAYERS = Registry('plugin layer')

DROPOUT_LAYERS = Registry('drop out layers')
POSITIONAL_ENCODING = Registry('position encoding')
ATTENTION = Registry('attention')
FEEDFORWARD_NETWORK = Registry('feed-forward Network')
TRANSFORMER_LAYER = Registry('transformerLayer')
TRANSFORMER_LAYER_SEQUENCE = Registry('transformer-layers sequence')
@@ -0,0 +1,21 @@
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn as nn


class Scale(nn.Module):
    """A learnable scale parameter.

    This layer scales the input by a learnable factor. It multiplies a
    learnable scale parameter of shape (1,) with input of any shape.

    Args:
        scale (float): Initial value of scale factor. Default: 1.0
    """

    def __init__(self, scale=1.0):
        super(Scale, self).__init__()
        self.scale = nn.Parameter(torch.tensor(scale, dtype=torch.float))

    def forward(self, x):
        return x * self.scale
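A brief usage sketch for Scale; the shapes are arbitrary.

import torch

scale = Scale(0.5)          # learnable scalar initialised to 0.5
x = torch.ones(2, 3)
print(scale(x))             # every element equals 0.5 before any training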
@@ -0,0 +1,25 @@
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn as nn

from .registry import ACTIVATION_LAYERS


@ACTIVATION_LAYERS.register_module()
class Swish(nn.Module):
    """Swish Module.

    This module applies the swish function:

    .. math::
        Swish(x) = x * Sigmoid(x)

    Returns:
        Tensor: The output tensor.
    """

    def __init__(self):
        super(Swish, self).__init__()

    def forward(self, x):
        return x * torch.sigmoid(x)
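A one-line usage sketch for Swish; the values are arbitrary.

import torch

act = Swish()
print(act(torch.tensor([-1.0, 0.0, 1.0])))   # x * sigmoid(x), approximately [-0.2689, 0.0000, 0.7311]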
@@ -0,0 +1,595 @@
|
||||
# Copyright (c) OpenMMLab. All rights reserved.
|
||||
import copy
|
||||
import warnings
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from annotator.mmpkg.mmcv import ConfigDict, deprecated_api_warning
|
||||
from annotator.mmpkg.mmcv.cnn import Linear, build_activation_layer, build_norm_layer
|
||||
from annotator.mmpkg.mmcv.runner.base_module import BaseModule, ModuleList, Sequential
|
||||
from annotator.mmpkg.mmcv.utils import build_from_cfg
|
||||
from .drop import build_dropout
|
||||
from .registry import (ATTENTION, FEEDFORWARD_NETWORK, POSITIONAL_ENCODING,
|
||||
TRANSFORMER_LAYER, TRANSFORMER_LAYER_SEQUENCE)
|
||||
|
||||
# Avoid BC-breaking of importing MultiScaleDeformableAttention from this file
|
||||
try:
|
||||
from annotator.mmpkg.mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention # noqa F401
|
||||
warnings.warn(
|
||||
ImportWarning(
|
||||
'``MultiScaleDeformableAttention`` has been moved to '
|
||||
'``mmcv.ops.multi_scale_deform_attn``, please change original path ' # noqa E501
|
||||
'``from annotator.mmpkg.mmcv.cnn.bricks.transformer import MultiScaleDeformableAttention`` ' # noqa E501
|
||||
'to ``from annotator.mmpkg.mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention`` ' # noqa E501
|
||||
))
|
||||
|
||||
except ImportError:
|
||||
warnings.warn('Fail to import ``MultiScaleDeformableAttention`` from '
|
||||
'``mmcv.ops.multi_scale_deform_attn``, '
|
||||
'You should install ``mmcv-full`` if you need this module. ')
|
||||
|
||||
|
||||
def build_positional_encoding(cfg, default_args=None):
|
||||
"""Builder for Position Encoding."""
|
||||
return build_from_cfg(cfg, POSITIONAL_ENCODING, default_args)
|
||||
|
||||
|
||||
def build_attention(cfg, default_args=None):
|
||||
"""Builder for attention."""
|
||||
return build_from_cfg(cfg, ATTENTION, default_args)
|
||||
|
||||
|
||||
def build_feedforward_network(cfg, default_args=None):
|
||||
"""Builder for feed-forward network (FFN)."""
|
||||
return build_from_cfg(cfg, FEEDFORWARD_NETWORK, default_args)
|
||||
|
||||
|
||||
def build_transformer_layer(cfg, default_args=None):
|
||||
"""Builder for transformer layer."""
|
||||
return build_from_cfg(cfg, TRANSFORMER_LAYER, default_args)
|
||||
|
||||
|
||||
def build_transformer_layer_sequence(cfg, default_args=None):
|
||||
"""Builder for transformer encoder and transformer decoder."""
|
||||
return build_from_cfg(cfg, TRANSFORMER_LAYER_SEQUENCE, default_args)
|
||||
|
||||
|
||||
@ATTENTION.register_module()
|
||||
class MultiheadAttention(BaseModule):
|
||||
"""A wrapper for ``torch.nn.MultiheadAttention``.
|
||||
|
||||
This module implements MultiheadAttention with identity connection,
|
||||
and positional encoding is also passed as input.
|
||||
|
||||
Args:
|
||||
embed_dims (int): The embedding dimension.
|
||||
num_heads (int): Parallel attention heads.
|
||||
attn_drop (float): A Dropout layer on attn_output_weights.
|
||||
Default: 0.0.
|
||||
proj_drop (float): A Dropout layer after `nn.MultiheadAttention`.
|
||||
Default: 0.0.
|
||||
dropout_layer (obj:`ConfigDict`): The dropout_layer used
|
||||
when adding the shortcut.
|
||||
init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
|
||||
Default: None.
|
||||
batch_first (bool): When it is True, Key, Query and Value are shape of
|
||||
(batch, n, embed_dim), otherwise (n, batch, embed_dim).
|
||||
Default to False.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
embed_dims,
|
||||
num_heads,
|
||||
attn_drop=0.,
|
||||
proj_drop=0.,
|
||||
dropout_layer=dict(type='Dropout', drop_prob=0.),
|
||||
init_cfg=None,
|
||||
batch_first=False,
|
||||
**kwargs):
|
||||
super(MultiheadAttention, self).__init__(init_cfg)
|
||||
if 'dropout' in kwargs:
|
||||
warnings.warn('The arguments `dropout` in MultiheadAttention '
|
||||
'has been deprecated, now you can separately '
|
||||
'set `attn_drop`(float), proj_drop(float), '
|
||||
'and `dropout_layer`(dict) ')
|
||||
attn_drop = kwargs['dropout']
|
||||
dropout_layer['drop_prob'] = kwargs.pop('dropout')
|
||||
|
||||
self.embed_dims = embed_dims
|
||||
self.num_heads = num_heads
|
||||
self.batch_first = batch_first
|
||||
|
||||
self.attn = nn.MultiheadAttention(embed_dims, num_heads, attn_drop,
|
||||
**kwargs)
|
||||
|
||||
self.proj_drop = nn.Dropout(proj_drop)
|
||||
self.dropout_layer = build_dropout(
|
||||
dropout_layer) if dropout_layer else nn.Identity()
|
||||
|
||||
@deprecated_api_warning({'residual': 'identity'},
|
||||
cls_name='MultiheadAttention')
|
||||
def forward(self,
|
||||
query,
|
||||
key=None,
|
||||
value=None,
|
||||
identity=None,
|
||||
query_pos=None,
|
||||
key_pos=None,
|
||||
attn_mask=None,
|
||||
key_padding_mask=None,
|
||||
**kwargs):
|
||||
"""Forward function for `MultiheadAttention`.
|
||||
|
||||
**kwargs allow passing a more general data flow when combining
|
||||
with other operations in `transformerlayer`.
|
||||
|
||||
Args:
|
||||
query (Tensor): The input query with shape [num_queries, bs,
|
||||
embed_dims] if self.batch_first is False, else
|
||||
[bs, num_queries embed_dims].
|
||||
key (Tensor): The key tensor with shape [num_keys, bs,
|
||||
embed_dims] if self.batch_first is False, else
|
||||
[bs, num_keys, embed_dims] .
|
||||
If None, the ``query`` will be used. Defaults to None.
|
||||
value (Tensor): The value tensor with same shape as `key`.
|
||||
Same in `nn.MultiheadAttention.forward`. Defaults to None.
|
||||
If None, the `key` will be used.
|
||||
identity (Tensor): This tensor, with the same shape as x,
|
||||
will be used for the identity link.
|
||||
If None, `x` will be used. Defaults to None.
|
||||
query_pos (Tensor): The positional encoding for query, with
|
||||
the same shape as `x`. If not None, it will
|
||||
be added to `x` before forward function. Defaults to None.
|
||||
key_pos (Tensor): The positional encoding for `key`, with the
|
||||
same shape as `key`. Defaults to None. If not None, it will
|
||||
be added to `key` before forward function. If None, and
|
||||
`query_pos` has the same shape as `key`, then `query_pos`
|
||||
will be used for `key_pos`. Defaults to None.
|
||||
attn_mask (Tensor): ByteTensor mask with shape [num_queries,
|
||||
num_keys]. Same in `nn.MultiheadAttention.forward`.
|
||||
Defaults to None.
|
||||
key_padding_mask (Tensor): ByteTensor with shape [bs, num_keys].
|
||||
Defaults to None.
|
||||
|
||||
Returns:
|
||||
Tensor: forwarded results with shape
|
||||
[num_queries, bs, embed_dims]
|
||||
if self.batch_first is False, else
|
||||
[bs, num_queries embed_dims].
|
||||
"""
|
||||
|
||||
if key is None:
|
||||
key = query
|
||||
if value is None:
|
||||
value = key
|
||||
if identity is None:
|
||||
identity = query
|
||||
if key_pos is None:
|
||||
if query_pos is not None:
|
||||
# use query_pos if key_pos is not available
|
||||
if query_pos.shape == key.shape:
|
||||
key_pos = query_pos
|
||||
else:
|
||||
warnings.warn(f'position encoding of key is'
|
||||
f'missing in {self.__class__.__name__}.')
|
||||
if query_pos is not None:
|
||||
query = query + query_pos
|
||||
if key_pos is not None:
|
||||
key = key + key_pos
|
||||
|
||||
# Because the dataflow('key', 'query', 'value') of
|
||||
# ``torch.nn.MultiheadAttention`` is (num_query, batch,
|
||||
# embed_dims), We should adjust the shape of dataflow from
|
||||
# batch_first (batch, num_query, embed_dims) to num_query_first
|
||||
# (num_query ,batch, embed_dims), and recover ``attn_output``
|
||||
# from num_query_first to batch_first.
|
||||
if self.batch_first:
|
||||
query = query.transpose(0, 1)
|
||||
key = key.transpose(0, 1)
|
||||
value = value.transpose(0, 1)
|
||||
|
||||
out = self.attn(
|
||||
query=query,
|
||||
key=key,
|
||||
value=value,
|
||||
attn_mask=attn_mask,
|
||||
key_padding_mask=key_padding_mask)[0]
|
||||
|
||||
if self.batch_first:
|
||||
out = out.transpose(0, 1)
|
||||
|
||||
return identity + self.dropout_layer(self.proj_drop(out))
|
||||
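A minimal usage sketch for the wrapper above. Shapes follow the default batch_first=False convention and the sizes are arbitrary.

import torch

attn = MultiheadAttention(embed_dims=256, num_heads=8)
query = torch.rand(100, 2, 256)    # (num_queries, bs, embed_dims)
out = attn(query)                  # key/value default to query, i.e. self-attention
assert out.shape == query.shape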
|
||||
|
||||
@FEEDFORWARD_NETWORK.register_module()
|
||||
class FFN(BaseModule):
|
||||
"""Implements feed-forward networks (FFNs) with identity connection.
|
||||
|
||||
Args:
|
||||
embed_dims (int): The feature dimension. Same as
|
||||
`MultiheadAttention`. Defaults: 256.
|
||||
feedforward_channels (int): The hidden dimension of FFNs.
|
||||
Defaults: 1024.
|
||||
num_fcs (int, optional): The number of fully-connected layers in
|
||||
FFNs. Default: 2.
|
||||
act_cfg (dict, optional): The activation config for FFNs.
|
||||
Default: dict(type='ReLU')
|
||||
ffn_drop (float, optional): Probability of an element to be
|
||||
zeroed in FFN. Default 0.0.
|
||||
add_identity (bool, optional): Whether to add the
|
||||
identity connection. Default: `True`.
|
||||
dropout_layer (obj:`ConfigDict`): The dropout_layer used
|
||||
when adding the shortcut.
|
||||
init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
|
||||
Default: None.
|
||||
"""
|
||||
|
||||
@deprecated_api_warning(
|
||||
{
|
||||
'dropout': 'ffn_drop',
|
||||
'add_residual': 'add_identity'
|
||||
},
|
||||
cls_name='FFN')
|
||||
def __init__(self,
|
||||
embed_dims=256,
|
||||
feedforward_channels=1024,
|
||||
num_fcs=2,
|
||||
act_cfg=dict(type='ReLU', inplace=True),
|
||||
ffn_drop=0.,
|
||||
dropout_layer=None,
|
||||
add_identity=True,
|
||||
init_cfg=None,
|
||||
**kwargs):
|
||||
super(FFN, self).__init__(init_cfg)
|
||||
assert num_fcs >= 2, 'num_fcs should be no less ' \
|
||||
f'than 2. got {num_fcs}.'
|
||||
self.embed_dims = embed_dims
|
||||
self.feedforward_channels = feedforward_channels
|
||||
self.num_fcs = num_fcs
|
||||
self.act_cfg = act_cfg
|
||||
self.activate = build_activation_layer(act_cfg)
|
||||
|
||||
layers = []
|
||||
in_channels = embed_dims
|
||||
for _ in range(num_fcs - 1):
|
||||
layers.append(
|
||||
Sequential(
|
||||
Linear(in_channels, feedforward_channels), self.activate,
|
||||
nn.Dropout(ffn_drop)))
|
||||
in_channels = feedforward_channels
|
||||
layers.append(Linear(feedforward_channels, embed_dims))
|
||||
layers.append(nn.Dropout(ffn_drop))
|
||||
self.layers = Sequential(*layers)
|
||||
self.dropout_layer = build_dropout(
|
||||
dropout_layer) if dropout_layer else torch.nn.Identity()
|
||||
self.add_identity = add_identity
|
||||
|
||||
@deprecated_api_warning({'residual': 'identity'}, cls_name='FFN')
|
||||
def forward(self, x, identity=None):
|
||||
"""Forward function for `FFN`.
|
||||
|
||||
The function adds x to the output tensor if ``identity`` is None.
|
||||
"""
|
||||
out = self.layers(x)
|
||||
if not self.add_identity:
|
||||
return self.dropout_layer(out)
|
||||
if identity is None:
|
||||
identity = x
|
||||
return identity + self.dropout_layer(out)
|
||||
|
||||
|
||||
@TRANSFORMER_LAYER.register_module()
|
||||
class BaseTransformerLayer(BaseModule):
|
||||
"""Base `TransformerLayer` for vision transformer.
|
||||
|
||||
It can be built from `mmcv.ConfigDict` and support more flexible
|
||||
customization, for example, using any number of `FFN or LN ` and
|
||||
use different kinds of `attention` by specifying a list of `ConfigDict`
|
||||
named `attn_cfgs`. It is worth mentioning that it supports `prenorm`
|
||||
when you specifying `norm` as the first element of `operation_order`.
|
||||
More details about the `prenorm`: `On Layer Normalization in the
|
||||
Transformer Architecture <https://arxiv.org/abs/2002.04745>`_ .
|
||||
|
||||
Args:
|
||||
attn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )):
|
||||
Configs for `self_attention` or `cross_attention` modules,
|
||||
The order of the configs in the list should be consistent with
|
||||
corresponding attentions in operation_order.
|
||||
If it is a dict, all of the attention modules in operation_order
|
||||
will be built with this config. Default: None.
|
||||
ffn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )):
|
||||
Configs for FFN, The order of the configs in the list should be
|
||||
consistent with corresponding ffn in operation_order.
|
||||
If it is a dict, all of the attention modules in operation_order
|
||||
will be built with this config.
|
||||
operation_order (tuple[str]): The execution order of operation
|
||||
in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm').
|
||||
Support `prenorm` when you specifying first element as `norm`.
|
||||
Default:None.
|
||||
norm_cfg (dict): Config dict for normalization layer.
|
||||
Default: dict(type='LN').
|
||||
init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
|
||||
Default: None.
|
||||
batch_first (bool): Key, Query and Value are shape
|
||||
of (batch, n, embed_dim)
|
||||
or (n, batch, embed_dim). Default to False.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
attn_cfgs=None,
|
||||
ffn_cfgs=dict(
|
||||
type='FFN',
|
||||
embed_dims=256,
|
||||
feedforward_channels=1024,
|
||||
num_fcs=2,
|
||||
ffn_drop=0.,
|
||||
act_cfg=dict(type='ReLU', inplace=True),
|
||||
),
|
||||
operation_order=None,
|
||||
norm_cfg=dict(type='LN'),
|
||||
init_cfg=None,
|
||||
batch_first=False,
|
||||
**kwargs):
|
||||
|
||||
deprecated_args = dict(
|
||||
feedforward_channels='feedforward_channels',
|
||||
ffn_dropout='ffn_drop',
|
||||
ffn_num_fcs='num_fcs')
|
||||
for ori_name, new_name in deprecated_args.items():
|
||||
if ori_name in kwargs:
|
||||
warnings.warn(
|
||||
f'The arguments `{ori_name}` in BaseTransformerLayer '
|
||||
f'has been deprecated, now you should set `{new_name}` '
|
||||
f'and other FFN related arguments '
|
||||
f'to a dict named `ffn_cfgs`. ')
|
||||
ffn_cfgs[new_name] = kwargs[ori_name]
|
||||
|
||||
super(BaseTransformerLayer, self).__init__(init_cfg)
|
||||
|
||||
self.batch_first = batch_first
|
||||
|
||||
assert set(operation_order) & set(
|
||||
['self_attn', 'norm', 'ffn', 'cross_attn']) == \
|
||||
set(operation_order), f'The operation_order of' \
|
||||
f' {self.__class__.__name__} should ' \
|
||||
f'contains all four operation type ' \
|
||||
f"{['self_attn', 'norm', 'ffn', 'cross_attn']}"
|
||||
|
||||
num_attn = operation_order.count('self_attn') + operation_order.count(
|
||||
'cross_attn')
|
||||
if isinstance(attn_cfgs, dict):
|
||||
attn_cfgs = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)]
|
||||
else:
|
||||
assert num_attn == len(attn_cfgs), f'The length ' \
|
||||
f'of attn_cfg {num_attn} is ' \
|
||||
f'not consistent with the number of attention' \
|
||||
f'in operation_order {operation_order}.'
|
||||
|
||||
self.num_attn = num_attn
|
||||
self.operation_order = operation_order
|
||||
self.norm_cfg = norm_cfg
|
||||
self.pre_norm = operation_order[0] == 'norm'
|
||||
self.attentions = ModuleList()
|
||||
|
||||
index = 0
|
||||
for operation_name in operation_order:
|
||||
if operation_name in ['self_attn', 'cross_attn']:
|
||||
if 'batch_first' in attn_cfgs[index]:
|
||||
assert self.batch_first == attn_cfgs[index]['batch_first']
|
||||
else:
|
||||
attn_cfgs[index]['batch_first'] = self.batch_first
|
||||
attention = build_attention(attn_cfgs[index])
|
||||
# Some custom attentions used as `self_attn`
|
||||
# or `cross_attn` can have different behavior.
|
||||
attention.operation_name = operation_name
|
||||
self.attentions.append(attention)
|
||||
index += 1
|
||||
|
||||
self.embed_dims = self.attentions[0].embed_dims
|
||||
|
||||
self.ffns = ModuleList()
|
||||
num_ffns = operation_order.count('ffn')
|
||||
if isinstance(ffn_cfgs, dict):
|
||||
ffn_cfgs = ConfigDict(ffn_cfgs)
|
||||
if isinstance(ffn_cfgs, dict):
|
||||
ffn_cfgs = [copy.deepcopy(ffn_cfgs) for _ in range(num_ffns)]
|
||||
assert len(ffn_cfgs) == num_ffns
|
||||
for ffn_index in range(num_ffns):
|
||||
if 'embed_dims' not in ffn_cfgs[ffn_index]:
|
||||
ffn_cfgs['embed_dims'] = self.embed_dims
|
||||
else:
|
||||
assert ffn_cfgs[ffn_index]['embed_dims'] == self.embed_dims
|
||||
self.ffns.append(
|
||||
build_feedforward_network(ffn_cfgs[ffn_index],
|
||||
dict(type='FFN')))
|
||||
|
||||
self.norms = ModuleList()
|
||||
num_norms = operation_order.count('norm')
|
||||
for _ in range(num_norms):
|
||||
self.norms.append(build_norm_layer(norm_cfg, self.embed_dims)[1])
|
||||
|
||||
def forward(self,
|
||||
query,
|
||||
key=None,
|
||||
value=None,
|
||||
query_pos=None,
|
||||
key_pos=None,
|
||||
attn_masks=None,
|
||||
query_key_padding_mask=None,
|
||||
key_padding_mask=None,
|
||||
**kwargs):
|
||||
"""Forward function for `TransformerDecoderLayer`.
|
||||
|
||||
**kwargs contains some specific arguments of attentions.
|
||||
|
||||
Args:
|
||||
query (Tensor): The input query with shape
|
||||
[num_queries, bs, embed_dims] if
|
||||
self.batch_first is False, else
|
||||
[bs, num_queries embed_dims].
|
||||
key (Tensor): The key tensor with shape [num_keys, bs,
|
||||
embed_dims] if self.batch_first is False, else
|
||||
[bs, num_keys, embed_dims] .
|
||||
value (Tensor): The value tensor with same shape as `key`.
|
||||
query_pos (Tensor): The positional encoding for `query`.
|
||||
Default: None.
|
||||
key_pos (Tensor): The positional encoding for `key`.
|
||||
Default: None.
|
||||
attn_masks (List[Tensor] | None): 2D Tensor used in
|
||||
calculation of corresponding attention. The length of
|
||||
it should equal to the number of `attention` in
|
||||
`operation_order`. Default: None.
|
||||
query_key_padding_mask (Tensor): ByteTensor for `query`, with
|
||||
shape [bs, num_queries]. Only used in `self_attn` layer.
|
||||
Defaults to None.
|
||||
key_padding_mask (Tensor): ByteTensor for `query`, with
|
||||
shape [bs, num_keys]. Default: None.
|
||||
|
||||
Returns:
|
||||
Tensor: forwarded results with shape [num_queries, bs, embed_dims].
|
||||
"""
|
||||
|
||||
norm_index = 0
|
||||
attn_index = 0
|
||||
ffn_index = 0
|
||||
identity = query
|
||||
if attn_masks is None:
|
||||
attn_masks = [None for _ in range(self.num_attn)]
|
||||
elif isinstance(attn_masks, torch.Tensor):
|
||||
attn_masks = [
|
||||
copy.deepcopy(attn_masks) for _ in range(self.num_attn)
|
||||
]
|
||||
warnings.warn(f'Use same attn_mask in all attentions in '
|
||||
f'{self.__class__.__name__} ')
|
||||
else:
|
||||
assert len(attn_masks) == self.num_attn, f'The length of ' \
|
||||
f'attn_masks {len(attn_masks)} must be equal ' \
|
||||
f'to the number of attention in ' \
|
||||
f'operation_order {self.num_attn}'
|
||||
|
||||
for layer in self.operation_order:
|
||||
if layer == 'self_attn':
|
||||
temp_key = temp_value = query
|
||||
query = self.attentions[attn_index](
|
||||
query,
|
||||
temp_key,
|
||||
temp_value,
|
||||
identity if self.pre_norm else None,
|
||||
query_pos=query_pos,
|
||||
key_pos=query_pos,
|
||||
attn_mask=attn_masks[attn_index],
|
||||
key_padding_mask=query_key_padding_mask,
|
||||
**kwargs)
|
||||
attn_index += 1
|
||||
identity = query
|
||||
|
||||
elif layer == 'norm':
|
||||
query = self.norms[norm_index](query)
|
||||
norm_index += 1
|
||||
|
||||
elif layer == 'cross_attn':
|
||||
query = self.attentions[attn_index](
|
||||
query,
|
||||
key,
|
||||
value,
|
||||
identity if self.pre_norm else None,
|
||||
query_pos=query_pos,
|
||||
key_pos=key_pos,
|
||||
attn_mask=attn_masks[attn_index],
|
||||
key_padding_mask=key_padding_mask,
|
||||
**kwargs)
|
||||
attn_index += 1
|
||||
identity = query
|
||||
|
||||
elif layer == 'ffn':
|
||||
query = self.ffns[ffn_index](
|
||||
query, identity if self.pre_norm else None)
|
||||
ffn_index += 1
|
||||
|
||||
return query
|
||||
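A config-driven usage sketch for the layer above: a post-norm layer with self-attention, cross-attention, and an FFN. The dimensions are illustrative only.

import torch

layer = BaseTransformerLayer(
    attn_cfgs=dict(type='MultiheadAttention', embed_dims=256, num_heads=8),
    ffn_cfgs=dict(type='FFN', embed_dims=256, feedforward_channels=1024),
    operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm'))

query = torch.rand(100, 2, 256)           # (num_queries, bs, embed_dims)
key = value = torch.rand(50, 2, 256)      # (num_keys, bs, embed_dims)
out = layer(query, key, value)            # shape (100, 2, 256)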
|
||||
|
||||
@TRANSFORMER_LAYER_SEQUENCE.register_module()
|
||||
class TransformerLayerSequence(BaseModule):
|
||||
"""Base class for TransformerEncoder and TransformerDecoder in vision
|
||||
transformer.
|
||||
|
||||
As base-class of Encoder and Decoder in vision transformer.
|
||||
Support customization such as specifying different kind
|
||||
of `transformer_layer` in `transformer_coder`.
|
||||
|
||||
Args:
|
||||
transformerlayer (list[obj:`mmcv.ConfigDict`] |
|
||||
obj:`mmcv.ConfigDict`): Config of transformerlayer
|
||||
in TransformerCoder. If it is obj:`mmcv.ConfigDict`,
|
||||
it would be repeated `num_layer` times to a
|
||||
list[`mmcv.ConfigDict`]. Default: None.
|
||||
num_layers (int): The number of `TransformerLayer`. Default: None.
|
||||
init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
|
||||
Default: None.
|
||||
"""
|
||||
|
||||
def __init__(self, transformerlayers=None, num_layers=None, init_cfg=None):
|
||||
super(TransformerLayerSequence, self).__init__(init_cfg)
|
||||
if isinstance(transformerlayers, dict):
|
||||
transformerlayers = [
|
||||
copy.deepcopy(transformerlayers) for _ in range(num_layers)
|
||||
]
|
||||
else:
|
||||
assert isinstance(transformerlayers, list) and \
|
||||
len(transformerlayers) == num_layers
|
||||
self.num_layers = num_layers
|
||||
self.layers = ModuleList()
|
||||
for i in range(num_layers):
|
||||
self.layers.append(build_transformer_layer(transformerlayers[i]))
|
||||
self.embed_dims = self.layers[0].embed_dims
|
||||
self.pre_norm = self.layers[0].pre_norm
|
||||
|
||||
def forward(self,
|
||||
query,
|
||||
key,
|
||||
value,
|
||||
query_pos=None,
|
||||
key_pos=None,
|
||||
attn_masks=None,
|
||||
query_key_padding_mask=None,
|
||||
key_padding_mask=None,
|
||||
**kwargs):
|
||||
"""Forward function for `TransformerCoder`.
|
||||
|
||||
Args:
|
||||
query (Tensor): Input query with shape
|
||||
`(num_queries, bs, embed_dims)`.
|
||||
key (Tensor): The key tensor with shape
|
||||
`(num_keys, bs, embed_dims)`.
|
||||
value (Tensor): The value tensor with shape
|
||||
`(num_keys, bs, embed_dims)`.
|
||||
query_pos (Tensor): The positional encoding for `query`.
|
||||
Default: None.
|
||||
key_pos (Tensor): The positional encoding for `key`.
|
||||
Default: None.
|
||||
attn_masks (List[Tensor], optional): Each element is 2D Tensor
|
||||
which is used in calculation of corresponding attention in
|
||||
operation_order. Default: None.
|
||||
query_key_padding_mask (Tensor): ByteTensor for `query`, with
|
||||
shape [bs, num_queries]. Only used in self-attention
|
||||
Default: None.
|
||||
key_padding_mask (Tensor): ByteTensor for `query`, with
|
||||
shape [bs, num_keys]. Default: None.
|
||||
|
||||
Returns:
|
||||
Tensor: results with shape [num_queries, bs, embed_dims].
|
||||
"""
|
||||
for layer in self.layers:
|
||||
query = layer(
|
||||
query,
|
||||
key,
|
||||
value,
|
||||
query_pos=query_pos,
|
||||
key_pos=key_pos,
|
||||
attn_masks=attn_masks,
|
||||
query_key_padding_mask=query_key_padding_mask,
|
||||
key_padding_mask=key_padding_mask,
|
||||
**kwargs)
|
||||
return query
|
||||
@@ -0,0 +1,84 @@
|
||||
# Copyright (c) OpenMMLab. All rights reserved.
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
from ..utils import xavier_init
|
||||
from .registry import UPSAMPLE_LAYERS
|
||||
|
||||
UPSAMPLE_LAYERS.register_module('nearest', module=nn.Upsample)
|
||||
UPSAMPLE_LAYERS.register_module('bilinear', module=nn.Upsample)
|
||||
|
||||
|
||||
@UPSAMPLE_LAYERS.register_module(name='pixel_shuffle')
|
||||
class PixelShufflePack(nn.Module):
|
||||
"""Pixel Shuffle upsample layer.
|
||||
|
||||
This module packs `F.pixel_shuffle()` and a nn.Conv2d module together to
|
||||
achieve a simple upsampling with pixel shuffle.
|
||||
|
||||
Args:
|
||||
in_channels (int): Number of input channels.
|
||||
out_channels (int): Number of output channels.
|
||||
scale_factor (int): Upsample ratio.
|
||||
upsample_kernel (int): Kernel size of the conv layer to expand the
|
||||
channels.
|
||||
"""
|
||||
|
||||
def __init__(self, in_channels, out_channels, scale_factor,
|
||||
upsample_kernel):
|
||||
super(PixelShufflePack, self).__init__()
|
||||
self.in_channels = in_channels
|
||||
self.out_channels = out_channels
|
||||
self.scale_factor = scale_factor
|
||||
self.upsample_kernel = upsample_kernel
|
||||
self.upsample_conv = nn.Conv2d(
|
||||
self.in_channels,
|
||||
self.out_channels * scale_factor * scale_factor,
|
||||
self.upsample_kernel,
|
||||
padding=(self.upsample_kernel - 1) // 2)
|
||||
self.init_weights()
|
||||
|
||||
def init_weights(self):
|
||||
xavier_init(self.upsample_conv, distribution='uniform')
|
||||
|
||||
def forward(self, x):
|
||||
x = self.upsample_conv(x)
|
||||
x = F.pixel_shuffle(x, self.scale_factor)
|
||||
return x
|
||||
|
||||
|
||||
def build_upsample_layer(cfg, *args, **kwargs):
|
||||
"""Build upsample layer.
|
||||
|
||||
Args:
|
||||
cfg (dict): The upsample layer config, which should contain:
|
||||
|
||||
- type (str): Layer type.
|
||||
- scale_factor (int): Upsample ratio, which is not applicable to
|
||||
deconv.
|
||||
- layer args: Args needed to instantiate a upsample layer.
|
||||
args (argument list): Arguments passed to the ``__init__``
|
||||
method of the corresponding conv layer.
|
||||
kwargs (keyword arguments): Keyword arguments passed to the
|
||||
``__init__`` method of the corresponding conv layer.
|
||||
|
||||
Returns:
|
||||
nn.Module: Created upsample layer.
|
||||
"""
|
||||
if not isinstance(cfg, dict):
|
||||
raise TypeError(f'cfg must be a dict, but got {type(cfg)}')
|
||||
if 'type' not in cfg:
|
||||
raise KeyError(
|
||||
f'the cfg dict must contain the key "type", but got {cfg}')
|
||||
cfg_ = cfg.copy()
|
||||
|
||||
layer_type = cfg_.pop('type')
|
||||
if layer_type not in UPSAMPLE_LAYERS:
|
||||
raise KeyError(f'Unrecognized upsample type {layer_type}')
|
||||
else:
|
||||
upsample = UPSAMPLE_LAYERS.get(layer_type)
|
||||
|
||||
if upsample is nn.Upsample:
|
||||
cfg_['mode'] = layer_type
|
||||
layer = upsample(*args, **kwargs, **cfg_)
|
||||
return layer
|
||||
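A usage sketch for build_upsample_layer with the pixel-shuffle layer defined above; channel counts and sizes are arbitrary.

import torch

up = build_upsample_layer(
    dict(type='pixel_shuffle', in_channels=16, out_channels=16,
         scale_factor=2, upsample_kernel=3))
x = torch.rand(1, 16, 8, 8)
print(up(x).shape)    # torch.Size([1, 16, 16, 16])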
@@ -0,0 +1,180 @@
# Copyright (c) OpenMMLab. All rights reserved.
r"""Modified from https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/wrappers.py # noqa: E501

Wrap some nn modules to support empty tensor input. Currently, these wrappers
are mainly used in mask heads like fcn_mask_head and maskiou_heads since mask
heads are trained on only positive RoIs.
"""
import math

import torch
import torch.nn as nn
from torch.nn.modules.utils import _pair, _triple

from .registry import CONV_LAYERS, UPSAMPLE_LAYERS

if torch.__version__ == 'parrots':
    TORCH_VERSION = torch.__version__
else:
    # torch.__version__ could be 1.3.1+cu92, we only need the first two
    # for comparison
    TORCH_VERSION = tuple(int(x) for x in torch.__version__.split('.')[:2])


def obsolete_torch_version(torch_version, version_threshold):
    return torch_version == 'parrots' or torch_version <= version_threshold


class NewEmptyTensorOp(torch.autograd.Function):

    @staticmethod
    def forward(ctx, x, new_shape):
        ctx.shape = x.shape
        return x.new_empty(new_shape)

    @staticmethod
    def backward(ctx, grad):
        shape = ctx.shape
        return NewEmptyTensorOp.apply(grad, shape), None


@CONV_LAYERS.register_module('Conv', force=True)
class Conv2d(nn.Conv2d):

    def forward(self, x):
        if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 4)):
            out_shape = [x.shape[0], self.out_channels]
            for i, k, p, s, d in zip(x.shape[-2:], self.kernel_size,
                                     self.padding, self.stride, self.dilation):
                o = (i + 2 * p - (d * (k - 1) + 1)) // s + 1
                out_shape.append(o)
            empty = NewEmptyTensorOp.apply(x, out_shape)
            if self.training:
                # produce dummy gradient to avoid DDP warning.
                dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0
                return empty + dummy
            else:
                return empty

        return super().forward(x)


@CONV_LAYERS.register_module('Conv3d', force=True)
class Conv3d(nn.Conv3d):

    def forward(self, x):
        if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 4)):
            out_shape = [x.shape[0], self.out_channels]
            for i, k, p, s, d in zip(x.shape[-3:], self.kernel_size,
                                     self.padding, self.stride, self.dilation):
                o = (i + 2 * p - (d * (k - 1) + 1)) // s + 1
                out_shape.append(o)
            empty = NewEmptyTensorOp.apply(x, out_shape)
            if self.training:
                # produce dummy gradient to avoid DDP warning.
                dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0
                return empty + dummy
            else:
                return empty

        return super().forward(x)


@CONV_LAYERS.register_module()
@CONV_LAYERS.register_module('deconv')
@UPSAMPLE_LAYERS.register_module('deconv', force=True)
class ConvTranspose2d(nn.ConvTranspose2d):

    def forward(self, x):
        if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 4)):
            out_shape = [x.shape[0], self.out_channels]
            for i, k, p, s, d, op in zip(x.shape[-2:], self.kernel_size,
                                         self.padding, self.stride,
                                         self.dilation, self.output_padding):
                out_shape.append((i - 1) * s - 2 * p + (d * (k - 1) + 1) + op)
            empty = NewEmptyTensorOp.apply(x, out_shape)
            if self.training:
                # produce dummy gradient to avoid DDP warning.
                dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0
                return empty + dummy
            else:
                return empty

        return super().forward(x)


@CONV_LAYERS.register_module()
@CONV_LAYERS.register_module('deconv3d')
@UPSAMPLE_LAYERS.register_module('deconv3d', force=True)
class ConvTranspose3d(nn.ConvTranspose3d):

    def forward(self, x):
        if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 4)):
            out_shape = [x.shape[0], self.out_channels]
            for i, k, p, s, d, op in zip(x.shape[-3:], self.kernel_size,
                                         self.padding, self.stride,
                                         self.dilation, self.output_padding):
                out_shape.append((i - 1) * s - 2 * p + (d * (k - 1) + 1) + op)
            empty = NewEmptyTensorOp.apply(x, out_shape)
            if self.training:
                # produce dummy gradient to avoid DDP warning.
                dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0
                return empty + dummy
            else:
                return empty

        return super().forward(x)


class MaxPool2d(nn.MaxPool2d):

    def forward(self, x):
        # PyTorch 1.9 does not support empty tensor inference yet
        if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 9)):
            out_shape = list(x.shape[:2])
            for i, k, p, s, d in zip(x.shape[-2:], _pair(self.kernel_size),
                                     _pair(self.padding), _pair(self.stride),
                                     _pair(self.dilation)):
                o = (i + 2 * p - (d * (k - 1) + 1)) / s + 1
                o = math.ceil(o) if self.ceil_mode else math.floor(o)
                out_shape.append(o)
            empty = NewEmptyTensorOp.apply(x, out_shape)
            return empty

        return super().forward(x)


class MaxPool3d(nn.MaxPool3d):

    def forward(self, x):
        # PyTorch 1.9 does not support empty tensor inference yet
        if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 9)):
            out_shape = list(x.shape[:2])
            for i, k, p, s, d in zip(x.shape[-3:], _triple(self.kernel_size),
                                     _triple(self.padding),
                                     _triple(self.stride),
                                     _triple(self.dilation)):
                o = (i + 2 * p - (d * (k - 1) + 1)) / s + 1
                o = math.ceil(o) if self.ceil_mode else math.floor(o)
                out_shape.append(o)
            empty = NewEmptyTensorOp.apply(x, out_shape)
            return empty

        return super().forward(x)


class Linear(torch.nn.Linear):

    def forward(self, x):
        # empty tensor forward of Linear layer is supported in Pytorch 1.6
        if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 5)):
            out_shape = [x.shape[0], self.out_features]
            empty = NewEmptyTensorOp.apply(x, out_shape)
            if self.training:
                # produce dummy gradient to avoid DDP warning.
                dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0
                return empty + dummy
            else:
                return empty

        return super().forward(x)
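Two small self-checks (not part of the diff) for the ideas used above, written against plain PyTorch so they run on a current install: the analytic output size matches a real convolution, and the "dummy" term touches every parameter so DDP sees no unused parameters.

import torch
import torch.nn as nn

# 1) The per-dimension formula used for empty inputs matches a real conv.
i, k, p, s, d = 32, 3, 1, 2, 1
analytic = (i + 2 * p - (d * (k - 1) + 1)) // s + 1                  # -> 16
out = nn.Conv2d(3, 8, k, stride=s, padding=p, dilation=d)(torch.zeros(1, 3, i, i))
assert out.shape[-1] == analytic == 16

# 2) The zero-valued "dummy" still depends on every parameter, so autograd
#    records (zero) gradients and DDP does not flag unused parameters.
layer = nn.Linear(4, 2)
dummy = sum(p.view(-1)[0] for p in layer.parameters()) * 0.0
dummy.backward()
assert all(torch.all(p.grad == 0) for p in layer.parameters())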
@@ -0,0 +1,30 @@
# Copyright (c) OpenMMLab. All rights reserved.
from ..runner import Sequential
from ..utils import Registry, build_from_cfg


def build_model_from_cfg(cfg, registry, default_args=None):
    """Build a PyTorch model from config dict(s). Different from
    ``build_from_cfg``, if cfg is a list, a ``nn.Sequential`` will be built.

    Args:
        cfg (dict, list[dict]): The config of modules; it is either a config
            dict or a list of config dicts. If cfg is a list, the built
            modules will be wrapped with ``nn.Sequential``.
        registry (:obj:`Registry`): A registry the module belongs to.
        default_args (dict, optional): Default arguments to build the module.
            Defaults to None.

    Returns:
        nn.Module: A built nn module.
    """
    if isinstance(cfg, list):
        modules = [
            build_from_cfg(cfg_, registry, default_args) for cfg_ in cfg
        ]
        return Sequential(*modules)
    else:
        return build_from_cfg(cfg, registry, default_args)


MODELS = Registry('model', build_func=build_model_from_cfg)
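A runnable sketch (not part of the diff) of the list-vs-dict behaviour described in the docstring; the registry and build_from_cfg below are simplified stand-ins for illustration, not the real mmcv API.

import torch.nn as nn

BLOCKS = {'linear': nn.Linear, 'relu': nn.ReLU}            # stand-in registry

def build_from_cfg(cfg, registry, default_args=None):      # simplified stand-in
    cfg = dict(cfg)
    return registry[cfg.pop('type')](**cfg)

def build_model_from_cfg(cfg, registry, default_args=None):
    # a list of configs becomes an nn.Sequential; a single dict becomes one module
    if isinstance(cfg, list):
        return nn.Sequential(*(build_from_cfg(c, registry) for c in cfg))
    return build_from_cfg(cfg, registry)

model = build_model_from_cfg(
    [dict(type='linear', in_features=8, out_features=4), dict(type='relu')],
    BLOCKS)
assert isinstance(model, nn.Sequential) and len(model) == 2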
316
extensions-builtin/forge_legacy_preprocessors/annotator/mmpkg/mmcv/cnn/resnet.py
Executable file
316
extensions-builtin/forge_legacy_preprocessors/annotator/mmpkg/mmcv/cnn/resnet.py
Executable file
@@ -0,0 +1,316 @@
# Copyright (c) OpenMMLab. All rights reserved.
import logging

import torch.nn as nn
import torch.utils.checkpoint as cp

from .utils import constant_init, kaiming_init


def conv3x3(in_planes, out_planes, stride=1, dilation=1):
    """3x3 convolution with padding."""
    return nn.Conv2d(
        in_planes,
        out_planes,
        kernel_size=3,
        stride=stride,
        padding=dilation,
        dilation=dilation,
        bias=False)


class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self,
                 inplanes,
                 planes,
                 stride=1,
                 dilation=1,
                 downsample=None,
                 style='pytorch',
                 with_cp=False):
        super(BasicBlock, self).__init__()
        assert style in ['pytorch', 'caffe']
        self.conv1 = conv3x3(inplanes, planes, stride, dilation)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride
        self.dilation = dilation
        assert not with_cp

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self,
                 inplanes,
                 planes,
                 stride=1,
                 dilation=1,
                 downsample=None,
                 style='pytorch',
                 with_cp=False):
        """Bottleneck block.

        If style is "pytorch", the stride-two layer is the 3x3 conv layer, if
        it is "caffe", the stride-two layer is the first 1x1 conv layer.
        """
        super(Bottleneck, self).__init__()
        assert style in ['pytorch', 'caffe']
        if style == 'pytorch':
            conv1_stride = 1
            conv2_stride = stride
        else:
            conv1_stride = stride
            conv2_stride = 1
        self.conv1 = nn.Conv2d(
            inplanes, planes, kernel_size=1, stride=conv1_stride, bias=False)
        self.conv2 = nn.Conv2d(
            planes,
            planes,
            kernel_size=3,
            stride=conv2_stride,
            padding=dilation,
            dilation=dilation,
            bias=False)

        self.bn1 = nn.BatchNorm2d(planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(
            planes, planes * self.expansion, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride
        self.dilation = dilation
        self.with_cp = with_cp

    def forward(self, x):

        def _inner_forward(x):
            residual = x

            out = self.conv1(x)
            out = self.bn1(out)
            out = self.relu(out)

            out = self.conv2(out)
            out = self.bn2(out)
            out = self.relu(out)

            out = self.conv3(out)
            out = self.bn3(out)

            if self.downsample is not None:
                residual = self.downsample(x)

            out += residual

            return out

        if self.with_cp and x.requires_grad:
            out = cp.checkpoint(_inner_forward, x)
        else:
            out = _inner_forward(x)

        out = self.relu(out)

        return out


def make_res_layer(block,
                   inplanes,
                   planes,
                   blocks,
                   stride=1,
                   dilation=1,
                   style='pytorch',
                   with_cp=False):
    downsample = None
    if stride != 1 or inplanes != planes * block.expansion:
        downsample = nn.Sequential(
            nn.Conv2d(
                inplanes,
                planes * block.expansion,
                kernel_size=1,
                stride=stride,
                bias=False),
            nn.BatchNorm2d(planes * block.expansion),
        )

    layers = []
    layers.append(
        block(
            inplanes,
            planes,
            stride,
            dilation,
            downsample,
            style=style,
            with_cp=with_cp))
    inplanes = planes * block.expansion
    for _ in range(1, blocks):
        layers.append(
            block(inplanes, planes, 1, dilation, style=style, with_cp=with_cp))

    return nn.Sequential(*layers)


class ResNet(nn.Module):
    """ResNet backbone.

    Args:
        depth (int): Depth of resnet, from {18, 34, 50, 101, 152}.
        num_stages (int): Resnet stages, normally 4.
        strides (Sequence[int]): Strides of the first block of each stage.
        dilations (Sequence[int]): Dilation of each stage.
        out_indices (Sequence[int]): Output from which stages.
        style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two
            layer is the 3x3 conv layer, otherwise the stride-two layer is
            the first 1x1 conv layer.
        frozen_stages (int): Stages to be frozen (all param fixed). -1 means
            not freezing any parameters.
        bn_eval (bool): Whether to set BN layers as eval mode, namely, freeze
            running stats (mean and var).
        bn_frozen (bool): Whether to freeze weight and bias of BN layers.
        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
            memory while slowing down the training speed.
    """

    arch_settings = {
        18: (BasicBlock, (2, 2, 2, 2)),
        34: (BasicBlock, (3, 4, 6, 3)),
        50: (Bottleneck, (3, 4, 6, 3)),
        101: (Bottleneck, (3, 4, 23, 3)),
        152: (Bottleneck, (3, 8, 36, 3))
    }

    def __init__(self,
                 depth,
                 num_stages=4,
                 strides=(1, 2, 2, 2),
                 dilations=(1, 1, 1, 1),
                 out_indices=(0, 1, 2, 3),
                 style='pytorch',
                 frozen_stages=-1,
                 bn_eval=True,
                 bn_frozen=False,
                 with_cp=False):
        super(ResNet, self).__init__()
        if depth not in self.arch_settings:
            raise KeyError(f'invalid depth {depth} for resnet')
        assert num_stages >= 1 and num_stages <= 4
        block, stage_blocks = self.arch_settings[depth]
        stage_blocks = stage_blocks[:num_stages]
        assert len(strides) == len(dilations) == num_stages
        assert max(out_indices) < num_stages

        self.out_indices = out_indices
        self.style = style
        self.frozen_stages = frozen_stages
        self.bn_eval = bn_eval
        self.bn_frozen = bn_frozen
        self.with_cp = with_cp

        self.inplanes = 64
        self.conv1 = nn.Conv2d(
            3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.res_layers = []
        for i, num_blocks in enumerate(stage_blocks):
            stride = strides[i]
            dilation = dilations[i]
            planes = 64 * 2**i
            res_layer = make_res_layer(
                block,
                self.inplanes,
                planes,
                num_blocks,
                stride=stride,
                dilation=dilation,
                style=self.style,
                with_cp=with_cp)
            self.inplanes = planes * block.expansion
            layer_name = f'layer{i + 1}'
            self.add_module(layer_name, res_layer)
            self.res_layers.append(layer_name)

        self.feat_dim = block.expansion * 64 * 2**(len(stage_blocks) - 1)

    def init_weights(self, pretrained=None):
        if isinstance(pretrained, str):
            logger = logging.getLogger()
            from ..runner import load_checkpoint
            load_checkpoint(self, pretrained, strict=False, logger=logger)
        elif pretrained is None:
            for m in self.modules():
                if isinstance(m, nn.Conv2d):
                    kaiming_init(m)
                elif isinstance(m, nn.BatchNorm2d):
                    constant_init(m, 1)
        else:
            raise TypeError('pretrained must be a str or None')

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        outs = []
        for i, layer_name in enumerate(self.res_layers):
            res_layer = getattr(self, layer_name)
            x = res_layer(x)
            if i in self.out_indices:
                outs.append(x)
        if len(outs) == 1:
            return outs[0]
        else:
            return tuple(outs)

    def train(self, mode=True):
        super(ResNet, self).train(mode)
        if self.bn_eval:
            for m in self.modules():
                if isinstance(m, nn.BatchNorm2d):
                    m.eval()
                    if self.bn_frozen:
                        for params in m.parameters():
                            params.requires_grad = False
        if mode and self.frozen_stages >= 0:
            for param in self.conv1.parameters():
                param.requires_grad = False
            for param in self.bn1.parameters():
                param.requires_grad = False
            self.bn1.eval()
            self.bn1.weight.requires_grad = False
            self.bn1.bias.requires_grad = False
            for i in range(1, self.frozen_stages + 1):
                mod = getattr(self, f'layer{i}')
                mod.eval()
                for param in mod.parameters():
                    param.requires_grad = False
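A small sanity sketch (not part of the diff) of what forward() returns for depth=50: with out_indices=(0, 1, 2, 3) and a 224x224 input, the four stage outputs have Bottleneck-expanded channel counts at strides 4/8/16/32 (the "pytorch" vs "caffe" style only moves where the stride-two conv sits, not these shapes).

# Expected (channels, spatial size) per stage for ResNet-50 on a 224x224 input.
expected = [(256, 56), (512, 28), (1024, 14), (2048, 7)]
planes, expansion = 64, 4
for i, (channels, size) in enumerate(expected):
    assert channels == planes * (2 ** i) * expansion   # planes double per stage, then *expansion
    assert size == 224 // (4 * 2 ** i)                 # stem stride 4, then /2 per stage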
@@ -0,0 +1,19 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .flops_counter import get_model_complexity_info
from .fuse_conv_bn import fuse_conv_bn
from .sync_bn import revert_sync_batchnorm
from .weight_init import (INITIALIZERS, Caffe2XavierInit, ConstantInit,
                          KaimingInit, NormalInit, PretrainedInit,
                          TruncNormalInit, UniformInit, XavierInit,
                          bias_init_with_prob, caffe2_xavier_init,
                          constant_init, initialize, kaiming_init, normal_init,
                          trunc_normal_init, uniform_init, xavier_init)

__all__ = [
    'get_model_complexity_info', 'bias_init_with_prob', 'caffe2_xavier_init',
    'constant_init', 'kaiming_init', 'normal_init', 'trunc_normal_init',
    'uniform_init', 'xavier_init', 'fuse_conv_bn', 'initialize',
    'INITIALIZERS', 'ConstantInit', 'XavierInit', 'NormalInit',
    'TruncNormalInit', 'UniformInit', 'KaimingInit', 'PretrainedInit',
    'Caffe2XavierInit', 'revert_sync_batchnorm'
]
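For context (not part of the diff), a hedged sketch of how these re-exported helpers are typically used, mirroring ResNet.init_weights above; the torch.nn.init calls only approximate what kaiming_init and constant_init do under mmcv's usual defaults (an assumption, not taken from this diff).

import torch.nn as nn

def init_like_resnet(model: nn.Module) -> None:
    for m in model.modules():
        if isinstance(m, nn.Conv2d):
            # roughly kaiming_init(m) under mmcv's assumed default arguments
            nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.BatchNorm2d):
            # roughly constant_init(m, 1)
            nn.init.constant_(m.weight, 1)
            nn.init.constant_(m.bias, 0)

init_like_resnet(nn.Sequential(nn.Conv2d(3, 8, 3), nn.BatchNorm2d(8)))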
Some files were not shown because too many files have changed in this diff.