Initial commit

huchenlei
2024-01-03 00:39:16 -05:00
commit a1f793c0a7
75 changed files with 10090 additions and 0 deletions

hand_refiner/__init__.py Normal file

@@ -0,0 +1,42 @@
import numpy as np
from PIL import Image

from .util import resize_image_with_pad, common_input_validate, HWC3, custom_hf_download
from hand_refiner.pipeline import MeshGraphormerMediapipe, args


class MeshGraphormerDetector:
    def __init__(self, pipeline):
        self.pipeline = pipeline

    @classmethod
    def from_pretrained(cls, pretrained_model_or_path, filename=None, hrnet_filename=None, cache_dir=None, device="cuda"):
        filename = filename or "graphormer_hand_state_dict.bin"
        hrnet_filename = hrnet_filename or "hrnetv2_w64_imagenet_pretrained.pth"
        args.resume_checkpoint = custom_hf_download(pretrained_model_or_path, filename, cache_dir)
        args.hrnet_checkpoint = custom_hf_download(pretrained_model_or_path, hrnet_filename, cache_dir)
        args.device = device
        pipeline = MeshGraphormerMediapipe(args)
        return cls(pipeline)

    def to(self, device):
        self.pipeline._model.to(device)
        self.pipeline.mano_model.to(device)
        self.pipeline.mano_model.layer.to(device)
        return self

    def __call__(self, input_image=None, mask_bbox_padding=30, detect_resolution=512, output_type=None, upscale_method="INTER_CUBIC", **kwargs):
        input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
        depth_map, mask, info = self.pipeline.get_depth(input_image, mask_bbox_padding)
        if depth_map is None:
            # No hand could be reconstructed (e.g. the hand is too small); fall back to empty maps
            depth_map = np.zeros_like(input_image)
            mask = np.zeros_like(input_image)
        depth_map, mask = HWC3(depth_map), HWC3(mask)
        # Note: only depth_map is resized to detect_resolution here; mask keeps the input resolution
        depth_map, remove_pad = resize_image_with_pad(depth_map, detect_resolution, upscale_method)
        depth_map = remove_pad(depth_map)
        if output_type == "pil":
            depth_map = Image.fromarray(depth_map)
            mask = Image.fromarray(mask)
        return depth_map, mask, info
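
For reference, a minimal usage sketch of the detector above (the repo id is an assumption; substitute whichever Hugging Face repo hosts graphormer_hand_state_dict.bin and hrnetv2_w64_imagenet_pretrained.pth):

import numpy as np
from PIL import Image
from hand_refiner import MeshGraphormerDetector

detector = MeshGraphormerDetector.from_pretrained(
    "hr16/ControlNet-HandRefiner-pruned",  # assumed repo id, not from this commit
    cache_dir="./ckpts",
    device="cuda",
)
input_image = np.array(Image.open("hand.jpg").convert("RGB"))
depth_map, mask, info = detector(input_image, mask_bbox_padding=30, output_type="pil")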

hand_refiner/cls_hrnet_w64_sgd_lr5e-2_wd1e-4_bs32_x100.yaml Normal file

@@ -0,0 +1,92 @@
GPUS: (0,1,2,3)
LOG_DIR: 'log/'
DATA_DIR: ''
OUTPUT_DIR: 'output/'
WORKERS: 4
PRINT_FREQ: 1000

MODEL:
  NAME: cls_hrnet
  IMAGE_SIZE:
    - 224
    - 224
  EXTRA:
    STAGE1:
      NUM_MODULES: 1
      NUM_BRANCHES: 1
      BLOCK: BOTTLENECK
      NUM_BLOCKS:
        - 4
      NUM_CHANNELS:
        - 64
      FUSE_METHOD: SUM
    STAGE2:
      NUM_MODULES: 1
      NUM_BRANCHES: 2
      BLOCK: BASIC
      NUM_BLOCKS:
        - 4
        - 4
      NUM_CHANNELS:
        - 64
        - 128
      FUSE_METHOD: SUM
    STAGE3:
      NUM_MODULES: 4
      NUM_BRANCHES: 3
      BLOCK: BASIC
      NUM_BLOCKS:
        - 4
        - 4
        - 4
      NUM_CHANNELS:
        - 64
        - 128
        - 256
      FUSE_METHOD: SUM
    STAGE4:
      NUM_MODULES: 3
      NUM_BRANCHES: 4
      BLOCK: BASIC
      NUM_BLOCKS:
        - 4
        - 4
        - 4
        - 4
      NUM_CHANNELS:
        - 64
        - 128
        - 256
        - 512
      FUSE_METHOD: SUM
CUDNN:
  BENCHMARK: true
  DETERMINISTIC: false
  ENABLED: true
DATASET:
  DATASET: 'imagenet'
  DATA_FORMAT: 'jpg'
  ROOT: 'data/imagenet/'
  TEST_SET: 'val'
  TRAIN_SET: 'train'
TEST:
  BATCH_SIZE_PER_GPU: 32
  MODEL_FILE: ''
TRAIN:
  BATCH_SIZE_PER_GPU: 32
  BEGIN_EPOCH: 0
  END_EPOCH: 100
  RESUME: true
  LR_FACTOR: 0.1
  LR_STEP:
    - 30
    - 60
    - 90
  OPTIMIZER: sgd
  LR: 0.05
  WD: 0.0001
  MOMENTUM: 0.9
  NESTEROV: true
  SHUFFLE: true
DEBUG:
  DEBUG: false
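
pipeline.py below consumes this config when args.arch == 'hrnet-w64'; a condensed sketch of those calls (the checkpoint path is a placeholder):

from pathlib import Path
from mesh_graphormer.modeling.hrnet.config import config as hrnet_config
from mesh_graphormer.modeling.hrnet.config import update_config as hrnet_update_config
from mesh_graphormer.modeling.hrnet.hrnet_cls_net_gridfeat import get_cls_net_gridfeat

hrnet_yaml = Path("hand_refiner") / "cls_hrnet_w64_sgd_lr5e-2_wd1e-4_bs32_x100.yaml"
hrnet_update_config(hrnet_config, hrnet_yaml)
backbone = get_cls_net_gridfeat(hrnet_config, pretrained="hrnetv2_w64_imagenet_pretrained.pth")  # placeholder path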

hand_refiner/depth_preprocessor.py Normal file

@@ -0,0 +1,6 @@
class Preprocessor:
    def __init__(self) -> None:
        pass

    def get_depth(self, input_dir, file_name):
        return
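
This stub only fixes the interface; MeshGraphormerMediapipe in pipeline.py below subclasses it and overrides get_depth with the actual MediaPipe + Graphormer implementation.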

hand_refiner/hand_landmarker.task Normal file
Binary file not shown.

hand_refiner/pipeline.py Normal file

@@ -0,0 +1,468 @@
import os
import gc
from argparse import Namespace
from pathlib import Path

import cv2
import numpy as np
import torch
import torchvision.models as models
from torchvision import transforms
from trimesh import Trimesh
from trimesh.ray.ray_triangle import RayMeshIntersector
import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision
from packaging import version

import mesh_graphormer
from mesh_graphormer.modeling.bert import BertConfig, Graphormer
from mesh_graphormer.modeling.bert import Graphormer_Hand_Network as Graphormer_Network
from mesh_graphormer.modeling._mano import MANO, Mesh
from mesh_graphormer.modeling.hrnet.hrnet_cls_net_gridfeat import get_cls_net_gridfeat
from mesh_graphormer.modeling.hrnet.config import config as hrnet_config
from mesh_graphormer.modeling.hrnet.config import update_config as hrnet_update_config
from mesh_graphormer.utils.miscellaneous import set_seed
from hand_refiner.depth_preprocessor import Preprocessor

args = Namespace(
    num_workers=4,
    img_scale_factor=1,
    image_file_or_path=os.path.join('', 'MeshGraphormer', 'samples', 'hand'),
    model_name_or_path=str(Path(mesh_graphormer.__file__).parent / "modeling/bert/bert-base-uncased"),
    resume_checkpoint=None,
    output_dir='output/',
    config_name='',
    a='hrnet-w64',
    arch='hrnet-w64',
    num_hidden_layers=4,
    hidden_size=-1,
    num_attention_heads=4,
    intermediate_size=-1,
    input_feat_dim='2051,512,128',
    hidden_feat_dim='1024,256,64',
    which_gcn='0,0,1',
    mesh_type='hand',
    run_eval_only=True,
    device="cpu",
    seed=88,
    hrnet_checkpoint=None,
)

# Since mediapipe v0.10.5, the handedness labels have been correct;
# earlier versions report them mirrored, so swap the mapping there.
if version.parse(mp.__version__) >= version.parse('0.10.5'):
    true_hand_category = {"Right": "right", "Left": "left"}
else:
    true_hand_category = {"Right": "left", "Left": "right"}
class MeshGraphormerMediapipe(Preprocessor):
    def __init__(self, args=args) -> None:
        # Setup CUDA, GPU & distributed training
        args.num_gpus = int(os.environ['WORLD_SIZE']) if 'WORLD_SIZE' in os.environ else 1
        os.environ['OMP_NUM_THREADS'] = str(args.num_workers)
        print('set os.environ[OMP_NUM_THREADS] to {}'.format(os.environ['OMP_NUM_THREADS']))

        #mkdir(args.output_dir)
        #logger = setup_logger("Graphormer", args.output_dir, get_rank())
        set_seed(args.seed, args.num_gpus)
        #logger.info("Using {} GPUs".format(args.num_gpus))

        # Mesh and MANO utils
        mano_model = MANO().to(args.device)
        mano_model.layer = mano_model.layer.to(args.device)
        mesh_sampler = Mesh(device=args.device)

        # Renderer for visualization
        # renderer = Renderer(faces=mano_model.face)

        # Load pretrained model
        trans_encoder = []
        input_feat_dim = [int(item) for item in args.input_feat_dim.split(',')]
        hidden_feat_dim = [int(item) for item in args.hidden_feat_dim.split(',')]
        output_feat_dim = input_feat_dim[1:] + [3]

        # which encoder block to have graph convs
        which_blk_graph = [int(item) for item in args.which_gcn.split(',')]

        if args.run_eval_only == True and args.resume_checkpoint != None and args.resume_checkpoint != 'None' and 'state_dict' not in args.resume_checkpoint:
            # if only run eval, load checkpoint
            #logger.info("Evaluation: Loading from checkpoint {}".format(args.resume_checkpoint))
            _model = torch.load(args.resume_checkpoint)
        else:
            # init three transformer-encoder blocks in a loop
            for i in range(len(output_feat_dim)):
                config_class, model_class = BertConfig, Graphormer
                config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
                config.output_attentions = False
                config.img_feature_dim = input_feat_dim[i]
                config.output_feature_dim = output_feat_dim[i]
                args.hidden_size = hidden_feat_dim[i]
                args.intermediate_size = int(args.hidden_size * 2)

                if which_blk_graph[i] == 1:
                    config.graph_conv = True
                    #logger.info("Add Graph Conv")
                else:
                    config.graph_conv = False

                config.mesh_type = args.mesh_type

                # update model structure if specified in arguments
                update_params = ['num_hidden_layers', 'hidden_size', 'num_attention_heads', 'intermediate_size']
                for idx, param in enumerate(update_params):
                    arg_param = getattr(args, param)
                    config_param = getattr(config, param)
                    if arg_param > 0 and arg_param != config_param:
                        #logger.info("Update config parameter {}: {} -> {}".format(param, config_param, arg_param))
                        setattr(config, param, arg_param)

                # init a transformer encoder and append it to a list
                assert config.hidden_size % config.num_attention_heads == 0
                model = model_class(config=config)
                #logger.info("Init model from scratch.")
                trans_encoder.append(model)

            # create backbone model
            if args.arch == 'hrnet':
                hrnet_yaml = Path(__file__).parent / 'cls_hrnet_w40_sgd_lr5e-2_wd1e-4_bs32_x100.yaml'
                hrnet_checkpoint = args.hrnet_checkpoint
                hrnet_update_config(hrnet_config, hrnet_yaml)
                backbone = get_cls_net_gridfeat(hrnet_config, pretrained=hrnet_checkpoint)
                #logger.info('=> loading hrnet-v2-w40 model')
            elif args.arch == 'hrnet-w64':
                hrnet_yaml = Path(__file__).parent / 'cls_hrnet_w64_sgd_lr5e-2_wd1e-4_bs32_x100.yaml'
                hrnet_checkpoint = args.hrnet_checkpoint
                hrnet_update_config(hrnet_config, hrnet_yaml)
                backbone = get_cls_net_gridfeat(hrnet_config, pretrained=hrnet_checkpoint)
                #logger.info('=> loading hrnet-v2-w64 model')
            else:
                print("=> using pre-trained model '{}'".format(args.arch))
                backbone = models.__dict__[args.arch](pretrained=True)
                # remove the last fc layer
                backbone = torch.nn.Sequential(*list(backbone.children())[:-1])

            trans_encoder = torch.nn.Sequential(*trans_encoder)
            total_params = sum(p.numel() for p in trans_encoder.parameters())
            #logger.info('Graphormer encoders total parameters: {}'.format(total_params))
            backbone_total_params = sum(p.numel() for p in backbone.parameters())
            #logger.info('Backbone total parameters: {}'.format(backbone_total_params))

            # build end-to-end Graphormer network (CNN backbone + multi-layer Graphormer encoder)
            _model = Graphormer_Network(args, config, backbone, trans_encoder)

            if args.resume_checkpoint != None and args.resume_checkpoint != 'None':
                # for fine-tuning or resume training or inference, load weights from checkpoint
                #logger.info("Loading state dict from checkpoint {}".format(args.resume_checkpoint))
                # workaround approach to load sparse tensor in graph conv.
                state_dict = torch.load(args.resume_checkpoint)
                _model.load_state_dict(state_dict, strict=False)
                del state_dict
                gc.collect()

        # update configs to enable attention outputs
        setattr(_model.trans_encoder[-1].config, 'output_attentions', True)
        setattr(_model.trans_encoder[-1].config, 'output_hidden_states', True)
        _model.trans_encoder[-1].bert.encoder.output_attentions = True
        _model.trans_encoder[-1].bert.encoder.output_hidden_states = True
        for iter_layer in range(4):
            _model.trans_encoder[-1].bert.encoder.layer[iter_layer].attention.self.output_attentions = True
        for inter_block in range(3):
            setattr(_model.trans_encoder[-1].config, 'device', args.device)

        _model.to(args.device)
        self._model = _model
        self.mano_model = mano_model
        self.mesh_sampler = mesh_sampler
        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225])])

        base_options = python.BaseOptions(model_asset_path=str(Path(__file__).parent / "hand_landmarker.task"))
        options = vision.HandLandmarkerOptions(base_options=base_options,
                                               min_hand_detection_confidence=0.6,
                                               min_hand_presence_confidence=0.6,
                                               min_tracking_confidence=0.6,
                                               num_hands=2)
        self.detector = vision.HandLandmarker.create_from_options(options)
    def get_rays(self, W, H, fx, fy, cx, cy, c2w_t, center_pixels):  # rot = I
        # Pinhole camera: the ray through pixel (i, j) has direction ((i-cx)/fx, (j-cy)/fy, 1),
        # normalized; all rays originate at the camera translation c2w_t.
        j, i = np.meshgrid(np.arange(H, dtype=np.float32), np.arange(W, dtype=np.float32))
        if center_pixels:
            i = i.copy() + 0.5
            j = j.copy() + 0.5
        directions = np.stack([(i - cx) / fx, (j - cy) / fy, np.ones_like(i)], -1)
        directions /= np.linalg.norm(directions, axis=-1, keepdims=True)
        rays_o = np.expand_dims(c2w_t, 0).repeat(H * W, 0)
        rays_d = directions  # (H, W, 3)
        rays_d = (rays_d / np.linalg.norm(rays_d, axis=-1, keepdims=True)).reshape(-1, 3)
        return rays_o, rays_d
    def get_mask_bounding_box(self, extrema, H, W, padding=30, dynamic_resize=0.15):
        # Pad the keypoint bounding box by max(15% of its size, `padding`) pixels
        # on each axis, clamped to the image bounds.
        x_min, x_max, y_min, y_max = extrema
        bb_xpad = max(int((x_max - x_min + 1) * dynamic_resize), padding)
        bb_ypad = max(int((y_max - y_min + 1) * dynamic_resize), padding)
        bbx_min = np.max((x_min - bb_xpad, 0))
        bbx_max = np.min((x_max + bb_xpad, W - 1))
        bby_min = np.max((y_min - bb_ypad, 0))
        bby_max = np.min((y_max + bb_ypad, H - 1))
        return bbx_min, bbx_max, bby_min, bby_max
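
    # A quick worked example (hypothetical numbers, not from this commit): for a
    # 100-px-wide hand box with padding=30, int(100 * 0.15) = 15, so the fixed 30-px
    # pad wins; for a 300-px box, int(300 * 0.15) = 45 > 30, so the pad grows with
    # the hand instead.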
    def run_inference(self, img, Graphormer_model, mano, mesh_sampler, scale, crop_len):
        global args
        H, W = int(crop_len), int(crop_len)
        Graphormer_model.eval()
        mano.eval()
        device = next(Graphormer_model.parameters()).device
        with torch.no_grad():
            img_tensor = self.transform(img)
            batch_imgs = torch.unsqueeze(img_tensor, 0).to(device)
            # forward-pass
            pred_camera, pred_3d_joints, pred_vertices_sub, pred_vertices, hidden_states, att = Graphormer_model(batch_imgs, mano, mesh_sampler)
            # obtain 3d joints, which are regressed from the full mesh
            pred_3d_joints_from_mesh = mano.get_3d_joints(pred_vertices)
            # obtain 2d joints, which are projected from 3d joints of mesh
            #pred_2d_joints_from_mesh = orthographic_projection(pred_3d_joints_from_mesh.contiguous(), pred_camera.contiguous())
            #pred_2d_coarse_vertices_from_mesh = orthographic_projection(pred_vertices_sub.contiguous(), pred_camera.contiguous())
            pred_camera = pred_camera.cpu()
            pred_vertices = pred_vertices.cpu()
            mesh = Trimesh(vertices=pred_vertices[0], faces=mano.face)
            res = crop_len
            focal_length = 1000 * scale
            camera_t = np.array([-pred_camera[1], -pred_camera[2], -2 * focal_length / (res * pred_camera[0] + 1e-9)])
            pred_3d_joints_camera = pred_3d_joints_from_mesh.cpu()[0] - camera_t
            z_3d_dist = pred_3d_joints_camera[:, 2].clone()
            pred_2d_joints_img_space = ((pred_3d_joints_camera / z_3d_dist[:, None]) * np.array((focal_length, focal_length, 1)))[:, :2] + np.array((W/2, H/2))

            rays_o, rays_d = self.get_rays(W, H, focal_length, focal_length, W/2, H/2, camera_t, True)
            coords = np.array(list(np.ndindex(H, W))).reshape(H, W, -1).transpose(1, 0, 2).reshape(-1, 2)
            intersector = RayMeshIntersector(mesh)
            points, index_ray, _ = intersector.intersects_location(rays_o, rays_d, multiple_hits=False)
            tri_index = intersector.intersects_first(rays_o, rays_d)
            tri_index = tri_index[index_ray]
            assert len(index_ray) == len(tri_index)
            discriminator = (np.sum(mesh.face_normals[tri_index] * rays_d[index_ray], axis=-1) <= 0)
            points = points[discriminator]  # rays that intersect interior (back-facing) faces are discarded
            if len(points) == 0:
                return None, None
            depth = (points + camera_t)[:, -1]
            index_ray = index_ray[discriminator]
            pixel_ray = coords[index_ray]
            minval = np.min(depth)
            maxval = np.max(depth)
            depthmap = np.zeros([H, W])
            # normalize so the nearest surface maps to 1.0 and the farthest to 0.2, then scale to 0-255
            depthmap[pixel_ray[:, 0], pixel_ray[:, 1]] = 1.0 - (0.8 * (depth - minval) / (maxval - minval))
            depthmap *= 255
        return depthmap, pred_2d_joints_img_space
    def get_depth(self, np_image, padding):
        info = {}
        # Load the input image (https://stackoverflow.com/a/76407270).
        image = mp.Image(image_format=mp.ImageFormat.SRGB, data=np_image.copy())

        # Detect hand landmarks from the input image.
        detection_result = self.detector.detect(image)
        handedness_list = detection_result.handedness
        hand_landmarks_list = detection_result.hand_landmarks
        raw_image = image.numpy_view()
        H, W, C = raw_image.shape

        # The landmark list can be empty; bail out early in that case.
        if len(hand_landmarks_list) == 0:
            return None, None, None

        raw_image = raw_image[:, :, :3]
        # Pad the image to double size so that hand crops near the borders stay in bounds.
        padded_image = np.zeros((H*2, W*2, 3))
        padded_image[int(1/2 * H):int(3/2 * H), int(1/2 * W):int(3/2 * W)] = raw_image

        # Sort hands by depth (z of landmark 9, farthest first) so that nearer hands
        # are processed last and overwrite farther ones in the shared depth map.
        hand_landmarks_list, handedness_list = zip(
            *sorted(
                zip(hand_landmarks_list, handedness_list), key=lambda x: x[0][9].z, reverse=True
            )
        )

        padded_depthmap = np.zeros((H*2, W*2))
        mask = np.zeros((H, W))
        crop_boxes = []
        #bboxes = []
        groundtruth_2d_keypoints = []
        hands = []
        depth_failure = False
        crop_lens = []
        for idx in range(len(hand_landmarks_list)):
            hand = true_hand_category[handedness_list[idx][0].category_name]
            hands.append(hand)
            hand_landmarks = hand_landmarks_list[idx]
            handedness = handedness_list[idx]
            height, width, _ = raw_image.shape
            x_coordinates = [landmark.x for landmark in hand_landmarks]
            y_coordinates = [landmark.y for landmark in hand_landmarks]

            # x_min, x_max, y_min, y_max: extrema from mediapipe keypoint detection
            x_min = int(min(x_coordinates) * width)
            x_max = int(max(x_coordinates) * width)
            x_c = (x_min + x_max) // 2
            y_min = int(min(y_coordinates) * height)
            y_max = int(max(y_coordinates) * height)
            y_c = (y_min + y_max) // 2

            #if x_max - x_min < 60 or y_max - y_min < 60:
            #    continue

            # enlarge the keypoint box by 1.6x and round down to an even length
            crop_len = (max(x_max - x_min, y_max - y_min) * 1.6) // 2 * 2

            # crop_x_min, crop_x_max, crop_y_min, crop_y_max: bounding box for mesh reconstruction
            crop_x_min = int(x_c - (crop_len/2 - 1) + W/2)
            crop_x_max = int(x_c + crop_len/2 + W/2)
            crop_y_min = int(y_c - (crop_len/2 - 1) + H/2)
            crop_y_max = int(y_c + crop_len/2 + H/2)
            cropped = padded_image[crop_y_min:crop_y_max+1, crop_x_min:crop_x_max+1]
            crop_boxes.append([crop_y_min, crop_y_max, crop_x_min, crop_x_max])
            crop_lens.append(crop_len)

            # mirror left hands before inference (the depth map is flipped back below)
            if hand == "left":
                cropped = cv2.flip(cropped, 1)

            if crop_len < 224:
                graphormer_input = cv2.resize(cropped, (224, 224), interpolation=cv2.INTER_CUBIC)
            else:
                graphormer_input = cv2.resize(cropped, (224, 224), interpolation=cv2.INTER_AREA)
            scale = crop_len / 224
            cropped_depthmap, pred_2d_keypoints = self.run_inference(graphormer_input.astype(np.uint8), self._model, self.mano_model, self.mesh_sampler, scale, int(crop_len))
            if cropped_depthmap is None:
                depth_failure = True
                break
            #keypoints_image_space = pred_2d_keypoints * (crop_y_max - crop_y_min + 1)/224
            groundtruth_2d_keypoints.append(pred_2d_keypoints)

            if hand == "left":
                cropped_depthmap = cv2.flip(cropped_depthmap, 1)
            resized_cropped_depthmap = cv2.resize(cropped_depthmap, (int(crop_len), int(crop_len)), interpolation=cv2.INTER_LINEAR)
            nonzero_y, nonzero_x = (resized_cropped_depthmap != 0).nonzero()
            if len(nonzero_y) == 0 or len(nonzero_x) == 0:
                depth_failure = True
                break
            padded_depthmap[crop_y_min+nonzero_y, crop_x_min+nonzero_x] = resized_cropped_depthmap[nonzero_y, nonzero_x]

            # nonzero stands for nonzero value on the depth map;
            # coordinates of nonzero depth pixels in original image space
            original_nonzero_x = crop_x_min + nonzero_x - int(W/2)
            original_nonzero_y = crop_y_min + nonzero_y - int(H/2)
            nonzerox_min = min(np.min(original_nonzero_x), x_min)
            nonzerox_max = max(np.max(original_nonzero_x), x_max)
            nonzeroy_min = min(np.min(original_nonzero_y), y_min)
            nonzeroy_max = max(np.max(original_nonzero_y), y_max)
            bbx_min, bbx_max, bby_min, bby_max = self.get_mask_bounding_box((nonzerox_min, nonzerox_max, nonzeroy_min, nonzeroy_max), H, W, padding)
            mask[bby_min:bby_max+1, bbx_min:bbx_max+1] = 1.0
            #bboxes.append([int(bbx_min), int(bbx_max), int(bby_min), int(bby_max)])

        if depth_failure:
            #print("cannot detect normal hands")
            return None, None, None

        depthmap = padded_depthmap[int(1/2 * H):int(3/2 * H), int(1/2 * W):int(3/2 * W)].astype(np.uint8)
        mask = (255.0 * mask).astype(np.uint8)
        info["groundtruth_2d_keypoints"] = groundtruth_2d_keypoints
        info["hands"] = hands
        info["crop_boxes"] = crop_boxes
        info["crop_lens"] = crop_lens
        return depthmap, mask, info
    def get_keypoints(self, img, Graphormer_model, mano, mesh_sampler, scale, crop_len):
        global args
        H, W = int(crop_len), int(crop_len)
        Graphormer_model.eval()
        mano.eval()
        device = next(Graphormer_model.parameters()).device
        with torch.no_grad():
            img_tensor = self.transform(img)
            #print(img_tensor)
            batch_imgs = torch.unsqueeze(img_tensor, 0).to(device)
            # forward-pass
            pred_camera, pred_3d_joints, pred_vertices_sub, pred_vertices, hidden_states, att = Graphormer_model(batch_imgs, mano, mesh_sampler)
            # obtain 3d joints, which are regressed from the full mesh
            pred_3d_joints_from_mesh = mano.get_3d_joints(pred_vertices)
            # obtain 2d joints, which are projected from 3d joints of mesh
            #pred_2d_joints_from_mesh = orthographic_projection(pred_3d_joints_from_mesh.contiguous(), pred_camera.contiguous())
            #pred_2d_coarse_vertices_from_mesh = orthographic_projection(pred_vertices_sub.contiguous(), pred_camera.contiguous())
            pred_camera = pred_camera.cpu()
            pred_vertices = pred_vertices.cpu()

            res = crop_len
            focal_length = 1000 * scale
            camera_t = np.array([-pred_camera[1], -pred_camera[2], -2 * focal_length / (res * pred_camera[0] + 1e-9)])
            pred_3d_joints_camera = pred_3d_joints_from_mesh.cpu()[0] - camera_t
            z_3d_dist = pred_3d_joints_camera[:, 2].clone()
            pred_2d_joints_img_space = ((pred_3d_joints_camera / z_3d_dist[:, None]) * np.array((focal_length, focal_length, 1)))[:, :2] + np.array((W/2, H/2))
        return pred_2d_joints_img_space
    def eval_mpjpe(self, sample, info):
        H, W, C = sample.shape
        padded_image = np.zeros((H*2, W*2, 3))
        padded_image[int(1/2 * H):int(3/2 * H), int(1/2 * W):int(3/2 * W)] = sample
        crop_boxes = info["crop_boxes"]
        hands = info["hands"]
        groundtruth_2d_keypoints = info["groundtruth_2d_keypoints"]
        crop_lens = info["crop_lens"]
        pjpe = 0
        for i in range(len(crop_boxes)):
            crop_y_min, crop_y_max, crop_x_min, crop_x_max = crop_boxes[i]
            cropped = padded_image[crop_y_min:crop_y_max+1, crop_x_min:crop_x_max+1]
            hand = hands[i]
            if hand == "left":
                cropped = cv2.flip(cropped, 1)
            crop_len = crop_lens[i]
            scale = crop_len / 224
            if crop_len < 224:
                graphormer_input = cv2.resize(cropped, (224, 224), interpolation=cv2.INTER_CUBIC)
            else:
                graphormer_input = cv2.resize(cropped, (224, 224), interpolation=cv2.INTER_AREA)
            generated_keypoint = self.get_keypoints(graphormer_input.astype(np.uint8), self._model, self.mano_model, self.mesh_sampler, scale, crop_len)
            #generated_keypoint = generated_keypoint * ((crop_y_max - crop_y_min + 1)/224)
            pjpe += np.sum(np.sqrt(np.sum(((generated_keypoint - groundtruth_2d_keypoints[i]) ** 2).numpy(), axis=1)))
        # mean per-joint position error over all hands (21 keypoints per hand)
        mpjpe = pjpe / (len(crop_boxes) * 21)
        return mpjpe
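
A minimal usage sketch of the pipeline above (the checkpoint paths are placeholders; in practice MeshGraphormerDetector.from_pretrained in __init__.py downloads them and fills these fields in):

import cv2
from hand_refiner.pipeline import MeshGraphormerMediapipe, args

args.resume_checkpoint = "path/to/graphormer_hand_state_dict.bin"  # placeholder
args.hrnet_checkpoint = "path/to/hrnetv2_w64_imagenet_pretrained.pth"  # placeholder
pipeline = MeshGraphormerMediapipe(args)

img = cv2.cvtColor(cv2.imread("hand.jpg"), cv2.COLOR_BGR2RGB)  # uint8 RGB
depthmap, mask, info = pipeline.get_depth(img, padding=30)
if depthmap is None:
    print("no usable hand detected")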

hand_refiner/util.py Normal file

@@ -0,0 +1,193 @@
import os
import random
import warnings
from pathlib import Path

import cv2
import numpy as np
from huggingface_hub import hf_hub_download

here = Path(__file__).parent.resolve()


def HWC3(x):
    assert x.dtype == np.uint8
    if x.ndim == 2:
        x = x[:, :, None]
    assert x.ndim == 3
    H, W, C = x.shape
    assert C == 1 or C == 3 or C == 4
    if C == 3:
        return x
    if C == 1:
        return np.concatenate([x, x, x], axis=2)
    if C == 4:
        # composite RGBA over a white background
        color = x[:, :, 0:3].astype(np.float32)
        alpha = x[:, :, 3:4].astype(np.float32) / 255.0
        y = color * alpha + 255.0 * (1.0 - alpha)
        y = y.clip(0, 255).astype(np.uint8)
        return y
def make_noise_disk(H, W, C, F):
    noise = np.random.uniform(low=0, high=1, size=((H // F) + 2, (W // F) + 2, C))
    noise = cv2.resize(noise, (W + 2 * F, H + 2 * F), interpolation=cv2.INTER_CUBIC)
    noise = noise[F: F + H, F: F + W]
    noise -= np.min(noise)
    noise /= np.max(noise)
    if C == 1:
        noise = noise[:, :, None]
    return noise


def nms(x, t, s):
    x = cv2.GaussianBlur(x.astype(np.float32), (0, 0), s)
    f1 = np.array([[0, 0, 0], [1, 1, 1], [0, 0, 0]], dtype=np.uint8)
    f2 = np.array([[0, 1, 0], [0, 1, 0], [0, 1, 0]], dtype=np.uint8)
    f3 = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]], dtype=np.uint8)
    f4 = np.array([[0, 0, 1], [0, 1, 0], [1, 0, 0]], dtype=np.uint8)
    y = np.zeros_like(x)
    for f in [f1, f2, f3, f4]:
        np.putmask(y, cv2.dilate(x, kernel=f) == x, x)
    z = np.zeros_like(y, dtype=np.uint8)
    z[y > t] = 255
    return z


def min_max_norm(x):
    x -= np.min(x)
    x /= np.maximum(np.max(x), 1e-5)
    return x


def safe_step(x, step=2):
    y = x.astype(np.float32) * float(step + 1)
    y = y.astype(np.int32).astype(np.float32) / float(step)
    return y


def img2mask(img, H, W, low=10, high=90):
    assert img.ndim == 3 or img.ndim == 2
    assert img.dtype == np.uint8
    if img.ndim == 3:
        y = img[:, :, random.randrange(0, img.shape[2])]
    else:
        y = img
    y = cv2.resize(y, (W, H), interpolation=cv2.INTER_CUBIC)
    if random.uniform(0, 1) < 0.5:
        y = 255 - y
    return y < np.percentile(y, random.randrange(low, high))


def safer_memory(x):
    # Fix many MAC/AMD problems
    return np.ascontiguousarray(x.copy()).copy()


UPSCALE_METHODS = ["INTER_NEAREST", "INTER_LINEAR", "INTER_AREA", "INTER_CUBIC", "INTER_LANCZOS4"]


def get_upscale_method(method_str):
    assert method_str in UPSCALE_METHODS, f"Method {method_str} not found in {UPSCALE_METHODS}"
    return getattr(cv2, method_str)


def pad64(x):
    return int(np.ceil(float(x) / 64.0) * 64 - x)
#https://github.com/Mikubill/sd-webui-controlnet/blob/main/scripts/processor.py#L17
#Added upscale_method param
def resize_image_with_pad(input_image, resolution, upscale_method="", skip_hwc3=False):
    if skip_hwc3:
        img = input_image
    else:
        img = HWC3(input_image)
    H_raw, W_raw, _ = img.shape
    k = float(resolution) / float(min(H_raw, W_raw))
    H_target = int(np.round(float(H_raw) * k))
    W_target = int(np.round(float(W_raw) * k))
    img = cv2.resize(img, (W_target, H_target), interpolation=get_upscale_method(upscale_method) if k > 1 else cv2.INTER_AREA)
    H_pad, W_pad = pad64(H_target), pad64(W_target)
    img_padded = np.pad(img, [[0, H_pad], [0, W_pad], [0, 0]], mode='edge')

    def remove_pad(x):
        return safer_memory(x[:H_target, :W_target, ...])

    return safer_memory(img_padded), remove_pad
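
# A worked example (hypothetical 500x375 input, resolution=512): k = 512/375, so the
# image is resized to 683x512, edge-padded to 704x512 (pad64 rounds each side up to a
# multiple of 64), and the returned remove_pad closure slices results back to 683x512.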
def common_input_validate(input_image, output_type, **kwargs):
    if "img" in kwargs:
        warnings.warn("img is deprecated, please use `input_image=...` instead.", DeprecationWarning)
        input_image = kwargs.pop("img")

    if "return_pil" in kwargs:
        warnings.warn("return_pil is deprecated. Use output_type instead.", DeprecationWarning)
        output_type = "pil" if kwargs["return_pil"] else "np"

    if type(output_type) is bool:
        warnings.warn("Passing `True` or `False` to `output_type` is deprecated and will raise an error in future versions")
        if output_type:
            output_type = "pil"

    if input_image is None:
        raise ValueError("input_image must be defined.")

    if not isinstance(input_image, np.ndarray):
        input_image = np.array(input_image, dtype=np.uint8)
        output_type = output_type or "pil"
    else:
        output_type = output_type or "np"

    return (input_image, output_type)
def custom_hf_download(pretrained_model_or_path, filename, cache_dir, subfolder='', use_symlinks=False):
    local_dir = os.path.join(cache_dir, pretrained_model_or_path)
    model_path = os.path.join(local_dir, *subfolder.split('/'), filename)

    if not os.path.exists(model_path):
        print(f"Failed to find {model_path}.\n Downloading from huggingface.co")
        if use_symlinks:
            cache_dir_d = os.getenv("HUGGINGFACE_HUB_CACHE")
            if cache_dir_d is None:
                import platform
                if platform.system() == "Windows":
                    cache_dir_d = os.path.join(os.getenv("USERPROFILE"), ".cache", "huggingface", "hub")
                else:
                    cache_dir_d = os.path.join(os.getenv("HOME"), ".cache", "huggingface", "hub")
            try:
                # test whether hard links work between the HF cache and our cache dir
                if not os.path.exists(cache_dir_d):
                    os.makedirs(cache_dir_d)
                open(os.path.join(cache_dir_d, f"linktest_{filename}.txt"), "w").close()
                os.link(os.path.join(cache_dir_d, f"linktest_{filename}.txt"), os.path.join(cache_dir, f"linktest_{filename}.txt"))
                os.remove(os.path.join(cache_dir, f"linktest_{filename}.txt"))
                os.remove(os.path.join(cache_dir_d, f"linktest_{filename}.txt"))
                print("Using symlinks to download models.\n",
                      "Make sure you have enough space in your cache folder,\n",
                      "and do not purge the cache folder after downloading;\n",
                      "otherwise you will have to re-download the models every time you run the script.\n",
                      "You can set USE_SYMLINKS: False in config.yaml to avoid this behavior.")
            except Exception:
                print("Unable to create symlinks; disabling them.")
                use_symlinks = False
                cache_dir_d = os.path.join(cache_dir, pretrained_model_or_path, "cache")
        else:
            cache_dir_d = os.path.join(cache_dir, pretrained_model_or_path, "cache")

        model_path = hf_hub_download(repo_id=pretrained_model_or_path,
                                     cache_dir=cache_dir_d,
                                     local_dir=local_dir,
                                     subfolder=subfolder,
                                     filename=filename,
                                     local_dir_use_symlinks=use_symlinks,
                                     resume_download=True,
                                     etag_timeout=100)
        if not use_symlinks:
            try:
                import shutil
                shutil.rmtree(cache_dir_d)
            except Exception as e:
                print(e)

    return model_path
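
Finally, a minimal usage sketch for the downloader (the repo id is an assumption; any Hugging Face repo hosting the file works):

model_path = custom_hf_download(
    "hr16/ControlNet-HandRefiner-pruned",  # assumed repo id, not from this commit
    "graphormer_hand_state_dict.bin",
    cache_dir="./ckpts",
)
# On a cache hit, the existing file at ./ckpts/<repo id>/graphormer_hand_state_dict.bin
# is returned without contacting huggingface.co.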