Mirror of https://github.com/huchenlei/HandRefinerPortable.git (synced 2026-04-30 19:21:17 +00:00)
✨ Initial commit
hand_refiner/__init__.py (new file, 42 lines)
@@ -0,0 +1,42 @@
import numpy as np
from PIL import Image
from .util import resize_image_with_pad, common_input_validate, HWC3, custom_hf_download
from hand_refiner.pipeline import MeshGraphormerMediapipe, args

class MeshGraphormerDetector:
    def __init__(self, pipeline):
        self.pipeline = pipeline

    @classmethod
    def from_pretrained(cls, pretrained_model_or_path, filename=None, hrnet_filename=None, cache_dir=None, device="cuda"):
        filename = filename or "graphormer_hand_state_dict.bin"
        hrnet_filename = hrnet_filename or "hrnetv2_w64_imagenet_pretrained.pth"
        args.resume_checkpoint = custom_hf_download(pretrained_model_or_path, filename, cache_dir)
        args.hrnet_checkpoint = custom_hf_download(pretrained_model_or_path, hrnet_filename, cache_dir)
        args.device = device
        pipeline = MeshGraphormerMediapipe(args)
        return cls(pipeline)

    def to(self, device):
        self.pipeline._model.to(device)
        self.pipeline.mano_model.to(device)
        self.pipeline.mano_model.layer.to(device)
        return self

    def __call__(self, input_image=None, mask_bbox_padding=30, detect_resolution=512, output_type=None, upscale_method="INTER_CUBIC", **kwargs):
        input_image, output_type = common_input_validate(input_image, output_type, **kwargs)

        depth_map, mask, info = self.pipeline.get_depth(input_image, mask_bbox_padding)
        if depth_map is None:
            depth_map = np.zeros_like(input_image)
            mask = np.zeros_like(input_image)

        # The detected hand can be small relative to the image, so resize
        # (with padding) to detect_resolution. Note the mask is returned at
        # the original input resolution.
        depth_map, mask = HWC3(depth_map), HWC3(mask)
        depth_map, remove_pad = resize_image_with_pad(depth_map, detect_resolution, upscale_method)
        depth_map = remove_pad(depth_map)
        if output_type == "pil":
            depth_map = Image.fromarray(depth_map)
            mask = Image.fromarray(mask)

        return depth_map, mask, info
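For context, a minimal usage sketch (not part of the commit): it assumes the two checkpoints live in a Hugging Face repo laid out the way custom_hf_download expects. The repo id and cache directory below are placeholders, not values taken from this codebase.

import cv2
from hand_refiner import MeshGraphormerDetector

# Hypothetical repo id and cache dir -- substitute your own.
detector = MeshGraphormerDetector.from_pretrained(
    "some-user/hand-refiner-checkpoints", cache_dir="./ckpts", device="cuda"
)
image = cv2.cvtColor(cv2.imread("hand.jpg"), cv2.COLOR_BGR2RGB)
depth_map, mask, info = detector(image, output_type="pil")
depth_map.save("hand_depth.png")
mask.save("hand_mask.png")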
hand_refiner/cls_hrnet_w64_sgd_lr5e-2_wd1e-4_bs32_x100.yaml (new file, 92 lines)
@@ -0,0 +1,92 @@
GPUS: (0,1,2,3)
LOG_DIR: 'log/'
DATA_DIR: ''
OUTPUT_DIR: 'output/'
WORKERS: 4
PRINT_FREQ: 1000

MODEL:
  NAME: cls_hrnet
  IMAGE_SIZE:
  - 224
  - 224
  EXTRA:
    STAGE1:
      NUM_MODULES: 1
      NUM_BRANCHES: 1
      BLOCK: BOTTLENECK
      NUM_BLOCKS:
      - 4
      NUM_CHANNELS:
      - 64
      FUSE_METHOD: SUM
    STAGE2:
      NUM_MODULES: 1
      NUM_BRANCHES: 2
      BLOCK: BASIC
      NUM_BLOCKS:
      - 4
      - 4
      NUM_CHANNELS:
      - 64
      - 128
      FUSE_METHOD: SUM
    STAGE3:
      NUM_MODULES: 4
      NUM_BRANCHES: 3
      BLOCK: BASIC
      NUM_BLOCKS:
      - 4
      - 4
      - 4
      NUM_CHANNELS:
      - 64
      - 128
      - 256
      FUSE_METHOD: SUM
    STAGE4:
      NUM_MODULES: 3
      NUM_BRANCHES: 4
      BLOCK: BASIC
      NUM_BLOCKS:
      - 4
      - 4
      - 4
      - 4
      NUM_CHANNELS:
      - 64
      - 128
      - 256
      - 512
      FUSE_METHOD: SUM
CUDNN:
  BENCHMARK: true
  DETERMINISTIC: false
  ENABLED: true
DATASET:
  DATASET: 'imagenet'
  DATA_FORMAT: 'jpg'
  ROOT: 'data/imagenet/'
  TEST_SET: 'val'
  TRAIN_SET: 'train'
TEST:
  BATCH_SIZE_PER_GPU: 32
  MODEL_FILE: ''
TRAIN:
  BATCH_SIZE_PER_GPU: 32
  BEGIN_EPOCH: 0
  END_EPOCH: 100
  RESUME: true
  LR_FACTOR: 0.1
  LR_STEP:
  - 30
  - 60
  - 90
  OPTIMIZER: sgd
  LR: 0.05
  WD: 0.0001
  MOMENTUM: 0.9
  NESTEROV: true
  SHUFFLE: true
DEBUG:
  DEBUG: false
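The filename reads as the training recipe (HRNet-W64, SGD, lr 5e-2, weight decay 1e-4, batch size 32 per GPU, 100 epochs), and the four STAGE blocks give the parallel branch widths 64/128/256/512 that make this the W64 variant. A quick sanity-check sketch, assuming PyYAML is available (it is not imported anywhere in this commit):

import yaml

with open("hand_refiner/cls_hrnet_w64_sgd_lr5e-2_wd1e-4_bs32_x100.yaml") as f:
    cfg = yaml.safe_load(f)

extra = cfg["MODEL"]["EXTRA"]
for name in ("STAGE1", "STAGE2", "STAGE3", "STAGE4"):
    stage = extra[name]
    # every stage must declare one width per branch
    assert len(stage["NUM_CHANNELS"]) == stage["NUM_BRANCHES"]
print(extra["STAGE4"]["NUM_CHANNELS"])  # [64, 128, 256, 512] -> "w64"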
hand_refiner/depth_preprocessor.py (new file, 6 lines)
@@ -0,0 +1,6 @@
class Preprocessor:
    def __init__(self) -> None:
        pass

    def get_depth(self, input_dir, file_name):
        return
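Preprocessor is a deliberately minimal base class: get_depth is a stub for subclasses to override, which MeshGraphormerMediapipe does in pipeline.py below. Note the override changes the signature from (input_dir, file_name) to (np_image, padding). A sketch of the pattern (DummyDepth is illustrative, not part of the commit):

import numpy as np
from hand_refiner.depth_preprocessor import Preprocessor

class DummyDepth(Preprocessor):
    # Illustrative subclass matching the override's (np_image, padding) shape
    def get_depth(self, np_image, padding):
        H, W = np_image.shape[:2]
        return np.zeros((H, W), dtype=np.uint8), np.zeros((H, W), dtype=np.uint8), {}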
hand_refiner/hand_landmarker.task (new binary file, contents not shown)
hand_refiner/pipeline.py (new file, 468 lines)
@@ -0,0 +1,468 @@
import os
import gc
from argparse import Namespace
from pathlib import Path

import cv2
import numpy as np
import torch
import torchvision.models as models
from torchvision import transforms
from trimesh import Trimesh
from trimesh.ray.ray_triangle import RayMeshIntersector
import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision
from packaging import version

import mesh_graphormer
from mesh_graphormer.modeling.bert import BertConfig, Graphormer
from mesh_graphormer.modeling.bert import Graphormer_Hand_Network as Graphormer_Network
from mesh_graphormer.modeling._mano import MANO, Mesh
from mesh_graphormer.modeling.hrnet.hrnet_cls_net_gridfeat import get_cls_net_gridfeat
from mesh_graphormer.modeling.hrnet.config import config as hrnet_config
from mesh_graphormer.modeling.hrnet.config import update_config as hrnet_update_config
from mesh_graphormer.utils.miscellaneous import set_seed

from hand_refiner.depth_preprocessor import Preprocessor

args = Namespace(
    num_workers=4,
    img_scale_factor=1,
    image_file_or_path=os.path.join('', 'MeshGraphormer', 'samples', 'hand'),
    model_name_or_path=str(Path(mesh_graphormer.__file__).parent / "modeling/bert/bert-base-uncased"),
    resume_checkpoint=None,
    output_dir='output/',
    config_name='',
    a='hrnet-w64',
    arch='hrnet-w64',
    num_hidden_layers=4,
    hidden_size=-1,
    num_attention_heads=4,
    intermediate_size=-1,
    input_feat_dim='2051,512,128',
    hidden_feat_dim='1024,256,64',
    which_gcn='0,0,1',
    mesh_type='hand',
    run_eval_only=True,
    device="cpu",
    seed=88,
    hrnet_checkpoint=None,
)

# Since mediapipe v0.10.5 the handedness labels are reported correctly;
# earlier versions swap "Left" and "Right".
if version.parse(mp.__version__) >= version.parse('0.10.5'):
    true_hand_category = {"Right": "right", "Left": "left"}
else:
    true_hand_category = {"Right": "left", "Left": "right"}

class MeshGraphormerMediapipe(Preprocessor):
    def __init__(self, args=args) -> None:
        # Setup CUDA, GPU & distributed training
        args.num_gpus = int(os.environ['WORLD_SIZE']) if 'WORLD_SIZE' in os.environ else 1
        os.environ['OMP_NUM_THREADS'] = str(args.num_workers)
        print('set os.environ[OMP_NUM_THREADS] to {}'.format(os.environ['OMP_NUM_THREADS']))

        #mkdir(args.output_dir)
        #logger = setup_logger("Graphormer", args.output_dir, get_rank())
        set_seed(args.seed, args.num_gpus)
        #logger.info("Using {} GPUs".format(args.num_gpus))

        # Mesh and MANO utils
        mano_model = MANO().to(args.device)
        mano_model.layer = mano_model.layer.to(args.device)
        mesh_sampler = Mesh(device=args.device)

        # Renderer for visualization
        # renderer = Renderer(faces=mano_model.face)

        # Load pretrained model
        trans_encoder = []

        input_feat_dim = [int(item) for item in args.input_feat_dim.split(',')]
        hidden_feat_dim = [int(item) for item in args.hidden_feat_dim.split(',')]
        output_feat_dim = input_feat_dim[1:] + [3]

        # which encoder block to have graph convs
        which_blk_graph = [int(item) for item in args.which_gcn.split(',')]

        if args.run_eval_only == True and args.resume_checkpoint is not None and args.resume_checkpoint != 'None' and 'state_dict' not in args.resume_checkpoint:
            # if only run eval, load checkpoint
            #logger.info("Evaluation: Loading from checkpoint {}".format(args.resume_checkpoint))
            _model = torch.load(args.resume_checkpoint)

        else:
            # init three transformer-encoder blocks in a loop
            for i in range(len(output_feat_dim)):
                config_class, model_class = BertConfig, Graphormer
                config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)

                config.output_attentions = False
                config.img_feature_dim = input_feat_dim[i]
                config.output_feature_dim = output_feat_dim[i]
                args.hidden_size = hidden_feat_dim[i]
                args.intermediate_size = int(args.hidden_size * 2)

                if which_blk_graph[i] == 1:
                    config.graph_conv = True
                    #logger.info("Add Graph Conv")
                else:
                    config.graph_conv = False

                config.mesh_type = args.mesh_type

                # update model structure if specified in arguments
                update_params = ['num_hidden_layers', 'hidden_size', 'num_attention_heads', 'intermediate_size']
                for idx, param in enumerate(update_params):
                    arg_param = getattr(args, param)
                    config_param = getattr(config, param)
                    if arg_param > 0 and arg_param != config_param:
                        #logger.info("Update config parameter {}: {} -> {}".format(param, config_param, arg_param))
                        setattr(config, param, arg_param)

                # init a transformer encoder and append it to a list
                assert config.hidden_size % config.num_attention_heads == 0
                model = model_class(config=config)
                #logger.info("Init model from scratch.")
                trans_encoder.append(model)

            # create backbone model
            if args.arch == 'hrnet':
                hrnet_yaml = Path(__file__).parent / 'cls_hrnet_w40_sgd_lr5e-2_wd1e-4_bs32_x100.yaml'
                hrnet_checkpoint = args.hrnet_checkpoint
                hrnet_update_config(hrnet_config, hrnet_yaml)
                backbone = get_cls_net_gridfeat(hrnet_config, pretrained=hrnet_checkpoint)
                #logger.info('=> loading hrnet-v2-w40 model')
            elif args.arch == 'hrnet-w64':
                hrnet_yaml = Path(__file__).parent / 'cls_hrnet_w64_sgd_lr5e-2_wd1e-4_bs32_x100.yaml'
                hrnet_checkpoint = args.hrnet_checkpoint
                hrnet_update_config(hrnet_config, hrnet_yaml)
                backbone = get_cls_net_gridfeat(hrnet_config, pretrained=hrnet_checkpoint)
                #logger.info('=> loading hrnet-v2-w64 model')
            else:
                print("=> using pre-trained model '{}'".format(args.arch))
                backbone = models.__dict__[args.arch](pretrained=True)
                # remove the last fc layer
                backbone = torch.nn.Sequential(*list(backbone.children())[:-1])

            trans_encoder = torch.nn.Sequential(*trans_encoder)
            total_params = sum(p.numel() for p in trans_encoder.parameters())
            #logger.info('Graphormer encoders total parameters: {}'.format(total_params))
            backbone_total_params = sum(p.numel() for p in backbone.parameters())
            #logger.info('Backbone total parameters: {}'.format(backbone_total_params))

            # build end-to-end Graphormer network (CNN backbone + multi-layer Graphormer encoder)
            _model = Graphormer_Network(args, config, backbone, trans_encoder)

            if args.resume_checkpoint is not None and args.resume_checkpoint != 'None':
                # for fine-tuning or resume training or inference, load weights from checkpoint
                #logger.info("Loading state dict from checkpoint {}".format(args.resume_checkpoint))
                # workaround approach to load sparse tensor in graph conv.
                state_dict = torch.load(args.resume_checkpoint)
                _model.load_state_dict(state_dict, strict=False)
                del state_dict
                gc.collect()

        # update configs to enable attention outputs
        setattr(_model.trans_encoder[-1].config, 'output_attentions', True)
        setattr(_model.trans_encoder[-1].config, 'output_hidden_states', True)
        _model.trans_encoder[-1].bert.encoder.output_attentions = True
        _model.trans_encoder[-1].bert.encoder.output_hidden_states = True
        for iter_layer in range(4):
            _model.trans_encoder[-1].bert.encoder.layer[iter_layer].attention.self.output_attentions = True
        for inter_block in range(3):
            setattr(_model.trans_encoder[-1].config, 'device', args.device)

        _model.to(args.device)
        self._model = _model
        self.mano_model = mano_model
        self.mesh_sampler = mesh_sampler

        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225])])

        base_options = python.BaseOptions(model_asset_path=str(Path(__file__).parent / "hand_landmarker.task"))
        options = vision.HandLandmarkerOptions(base_options=base_options,
                                               min_hand_detection_confidence=0.6,
                                               min_hand_presence_confidence=0.6,
                                               min_tracking_confidence=0.6,
                                               num_hands=2)

        self.detector = vision.HandLandmarker.create_from_options(options)

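With the default args, input_feat_dim '2051,512,128' and output_feat_dim = input_feat_dim[1:] + [3] give three encoder blocks that map per-vertex features 2051 -> 512 -> 128 -> 3, i.e. down to 3-D coordinates, and which_gcn '0,0,1' enables the graph convolution only in the last block. A small sketch mirroring the parsing above:

input_feat_dim = [int(x) for x in '2051,512,128'.split(',')]   # [2051, 512, 128]
hidden_feat_dim = [int(x) for x in '1024,256,64'.split(',')]   # per-block hidden sizes
output_feat_dim = input_feat_dim[1:] + [3]                     # [512, 128, 3]
which_blk_graph = [int(x) for x in '0,0,1'.split(',')]         # graph conv in last block only
for i, (fin, fout) in enumerate(zip(input_feat_dim, output_feat_dim)):
    print(f"block {i}: {fin} -> {fout}, graph_conv={bool(which_blk_graph[i])}")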
    def get_rays(self, W, H, fx, fy, cx, cy, c2w_t, center_pixels):  # rot = I
        j, i = np.meshgrid(np.arange(H, dtype=np.float32), np.arange(W, dtype=np.float32))
        if center_pixels:
            i = i.copy() + 0.5
            j = j.copy() + 0.5

        directions = np.stack([(i - cx) / fx, (j - cy) / fy, np.ones_like(i)], -1)
        directions /= np.linalg.norm(directions, axis=-1, keepdims=True)

        rays_o = np.expand_dims(c2w_t, 0).repeat(H * W, 0)

        rays_d = directions  # (H, W, 3)
        rays_d = (rays_d / np.linalg.norm(rays_d, axis=-1, keepdims=True)).reshape(-1, 3)

        return rays_o, rays_d

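get_rays implements a standard pinhole camera with identity rotation: pixel (u, v) maps to direction ((u - cx)/fx, (v - cy)/fy, 1), normalized, and every ray starts at the camera translation c2w_t. A quick sanity check, assuming the package's dependencies are installed (self is unused, so the method can be called unbound):

import numpy as np
from hand_refiner.pipeline import MeshGraphormerMediapipe

rays_o, rays_d = MeshGraphormerMediapipe.get_rays(
    None, W=4, H=4, fx=2.0, fy=2.0, cx=2.0, cy=2.0,
    c2w_t=np.zeros(3), center_pixels=False)
print(rays_o.shape, rays_d.shape)        # (16, 3) (16, 3)
# the ray through the principal point points straight down +z
print(rays_d[np.argmax(rays_d[:, 2])])   # ~[0. 0. 1.]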
    def get_mask_bounding_box(self, extrema, H, W, padding=30, dynamic_resize=0.15):
        x_min, x_max, y_min, y_max = extrema
        bb_xpad = max(int((x_max - x_min + 1) * dynamic_resize), padding)
        bb_ypad = max(int((y_max - y_min + 1) * dynamic_resize), padding)
        bbx_min = np.max((x_min - bb_xpad, 0))
        bbx_max = np.min((x_max + bb_xpad, W - 1))
        bby_min = np.max((y_min - bb_ypad, 0))
        bby_max = np.min((y_max + bb_ypad, H - 1))
        return bbx_min, bbx_max, bby_min, bby_max

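The padding applied per axis is the larger of the fixed minimum (default 30 px) and 15% of the box side, then clamped to the image bounds. A worked example:

# extrema = (x_min, x_max, y_min, y_max); a 100-px-wide, 300-px-tall hand
box = (400, 499, 100, 399)
# horizontal pad: max(int(100 * 0.15), 30) = 30 (fixed minimum wins)
# vertical pad:   max(int(300 * 0.15), 30) = 45 (dynamic 15% wins)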
    def run_inference(self, img, Graphormer_model, mano, mesh_sampler, scale, crop_len):
        global args
        H, W = int(crop_len), int(crop_len)
        Graphormer_model.eval()
        mano.eval()
        device = next(Graphormer_model.parameters()).device
        with torch.no_grad():
            img_tensor = self.transform(img)
            batch_imgs = torch.unsqueeze(img_tensor, 0).to(device)

            # forward-pass
            pred_camera, pred_3d_joints, pred_vertices_sub, pred_vertices, hidden_states, att = Graphormer_model(batch_imgs, mano, mesh_sampler)

            # obtain 3d joints, which are regressed from the full mesh
            pred_3d_joints_from_mesh = mano.get_3d_joints(pred_vertices)
            # obtain 2d joints, which are projected from 3d joints of mesh
            #pred_2d_joints_from_mesh = orthographic_projection(pred_3d_joints_from_mesh.contiguous(), pred_camera.contiguous())
            #pred_2d_coarse_vertices_from_mesh = orthographic_projection(pred_vertices_sub.contiguous(), pred_camera.contiguous())
            pred_camera = pred_camera.cpu()
            pred_vertices = pred_vertices.cpu()
            mesh = Trimesh(vertices=pred_vertices[0], faces=mano.face)
            res = crop_len
            focal_length = 1000 * scale
            camera_t = np.array([-pred_camera[1], -pred_camera[2], -2 * focal_length / (res * pred_camera[0] + 1e-9)])
            pred_3d_joints_camera = pred_3d_joints_from_mesh.cpu()[0] - camera_t
            z_3d_dist = pred_3d_joints_camera[:, 2].clone()

            pred_2d_joints_img_space = ((pred_3d_joints_camera / z_3d_dist[:, None]) * np.array((focal_length, focal_length, 1)))[:, :2] + np.array((W / 2, H / 2))

            rays_o, rays_d = self.get_rays(W, H, focal_length, focal_length, W / 2, H / 2, camera_t, True)
            coords = np.array(list(np.ndindex(H, W))).reshape(H, W, -1).transpose(1, 0, 2).reshape(-1, 2)
            intersector = RayMeshIntersector(mesh)
            points, index_ray, _ = intersector.intersects_location(rays_o, rays_d, multiple_hits=False)

            tri_index = intersector.intersects_first(rays_o, rays_d)

            tri_index = tri_index[index_ray]

            assert len(index_ray) == len(tri_index)

            discriminator = (np.sum(mesh.face_normals[tri_index] * rays_d[index_ray], axis=-1) <= 0)
            points = points[discriminator]  # rays that intersect back-facing (interior) faces are discarded

            if len(points) == 0:
                return None, None
            depth = (points + camera_t)[:, -1]
            index_ray = index_ray[discriminator]
            pixel_ray = coords[index_ray]

            minval = np.min(depth)
            maxval = np.max(depth)
            depthmap = np.zeros([H, W])

            depthmap[pixel_ray[:, 0], pixel_ray[:, 1]] = 1.0 - (0.8 * (depth - minval) / (maxval - minval))
            depthmap *= 255
        return depthmap, pred_2d_joints_img_space

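The depth render maps each hit to intensity 1.0 - 0.8 * (d - min) / (max - min), so the nearest surface gets 1.0 and the farthest 0.2 before scaling by 255: closer is brighter, and unhit background pixels stay 0. A compact restatement:

import numpy as np

def shade(depth):
    # nearest hit -> 255, farthest hit -> 51 (= 0.2 * 255); background stays 0
    d = (depth - depth.min()) / (depth.max() - depth.min())
    return (1.0 - 0.8 * d) * 255

print(shade(np.array([0.3, 0.5, 0.7])))  # [255. 153.  51.]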
    def get_depth(self, np_image, padding):
        info = {}

        # Load the input image (https://stackoverflow.com/a/76407270)
        image = mp.Image(image_format=mp.ImageFormat.SRGB, data=np_image.copy())

        # Detect hand landmarks from the input image.
        detection_result = self.detector.detect(image)

        handedness_list = detection_result.handedness
        hand_landmarks_list = detection_result.hand_landmarks

        raw_image = image.numpy_view()
        H, W, C = raw_image.shape

        # hand_landmarks_list can be empty; bail out early in that case
        if len(hand_landmarks_list) == 0:
            return None, None, None
        raw_image = raw_image[:, :, :3]

        padded_image = np.zeros((H * 2, W * 2, 3))
        padded_image[int(1/2 * H):int(3/2 * H), int(1/2 * W):int(3/2 * W)] = raw_image

        # render farther hands first so nearer hands overwrite them
        hand_landmarks_list, handedness_list = zip(
            *sorted(
                zip(hand_landmarks_list, handedness_list), key=lambda x: x[0][9].z, reverse=True
            )
        )

        padded_depthmap = np.zeros((H * 2, W * 2))
        mask = np.zeros((H, W))
        crop_boxes = []
        #bboxes = []
        groundtruth_2d_keypoints = []
        hands = []
        depth_failure = False
        crop_lens = []

        for idx in range(len(hand_landmarks_list)):
            hand = true_hand_category[handedness_list[idx][0].category_name]
            hands.append(hand)
            hand_landmarks = hand_landmarks_list[idx]
            handedness = handedness_list[idx]
            height, width, _ = raw_image.shape
            x_coordinates = [landmark.x for landmark in hand_landmarks]
            y_coordinates = [landmark.y for landmark in hand_landmarks]

            # x_min, x_max, y_min, y_max: extrema from mediapipe keypoint detection
            x_min = int(min(x_coordinates) * width)
            x_max = int(max(x_coordinates) * width)
            x_c = (x_min + x_max) // 2
            y_min = int(min(y_coordinates) * height)
            y_max = int(max(y_coordinates) * height)
            y_c = (y_min + y_max) // 2

            #if x_max - x_min < 60 or y_max - y_min < 60:
            #    continue

            crop_len = (max(x_max - x_min, y_max - y_min) * 1.6) // 2 * 2

            # crop_x_min, crop_x_max, crop_y_min, crop_y_max: bounding box for mesh reconstruction
            crop_x_min = int(x_c - (crop_len / 2 - 1) + W / 2)
            crop_x_max = int(x_c + crop_len / 2 + W / 2)
            crop_y_min = int(y_c - (crop_len / 2 - 1) + H / 2)
            crop_y_max = int(y_c + crop_len / 2 + H / 2)

            cropped = padded_image[crop_y_min:crop_y_max + 1, crop_x_min:crop_x_max + 1]
            crop_boxes.append([crop_y_min, crop_y_max, crop_x_min, crop_x_max])
            crop_lens.append(crop_len)
            if hand == "left":
                cropped = cv2.flip(cropped, 1)

            if crop_len < 224:
                graphormer_input = cv2.resize(cropped, (224, 224), interpolation=cv2.INTER_CUBIC)
            else:
                graphormer_input = cv2.resize(cropped, (224, 224), interpolation=cv2.INTER_AREA)
            scale = crop_len / 224
            cropped_depthmap, pred_2d_keypoints = self.run_inference(graphormer_input.astype(np.uint8), self._model, self.mano_model, self.mesh_sampler, scale, int(crop_len))

            if cropped_depthmap is None:
                depth_failure = True
                break
            #keypoints_image_space = pred_2d_keypoints * (crop_y_max - crop_y_min + 1)/224
            groundtruth_2d_keypoints.append(pred_2d_keypoints)

            if hand == "left":
                cropped_depthmap = cv2.flip(cropped_depthmap, 1)
            resized_cropped_depthmap = cv2.resize(cropped_depthmap, (int(crop_len), int(crop_len)), interpolation=cv2.INTER_LINEAR)
            nonzero_y, nonzero_x = (resized_cropped_depthmap != 0).nonzero()
            if len(nonzero_y) == 0 or len(nonzero_x) == 0:
                depth_failure = True
                break
            padded_depthmap[crop_y_min + nonzero_y, crop_x_min + nonzero_x] = resized_cropped_depthmap[nonzero_y, nonzero_x]

            # nonzero stands for nonzero value on the depth map
            # coordinates of nonzero depth pixels in original image space
            original_nonzero_x = crop_x_min + nonzero_x - int(W / 2)
            original_nonzero_y = crop_y_min + nonzero_y - int(H / 2)

            nonzerox_min = min(np.min(original_nonzero_x), x_min)
            nonzerox_max = max(np.max(original_nonzero_x), x_max)
            nonzeroy_min = min(np.min(original_nonzero_y), y_min)
            nonzeroy_max = max(np.max(original_nonzero_y), y_max)

            bbx_min, bbx_max, bby_min, bby_max = self.get_mask_bounding_box((nonzerox_min, nonzerox_max, nonzeroy_min, nonzeroy_max), H, W, padding)
            mask[bby_min:bby_max + 1, bbx_min:bbx_max + 1] = 1.0
            #bboxes.append([int(bbx_min), int(bbx_max), int(bby_min), int(bby_max)])
        if depth_failure:
            #print("cannot detect normal hands")
            return None, None, None
        depthmap = padded_depthmap[int(1/2 * H):int(3/2 * H), int(1/2 * W):int(3/2 * W)].astype(np.uint8)
        mask = (255.0 * mask).astype(np.uint8)
        info["groundtruth_2d_keypoints"] = groundtruth_2d_keypoints
        info["hands"] = hands
        info["crop_boxes"] = crop_boxes
        info["crop_lens"] = crop_lens
        return depthmap, mask, info

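get_depth renders each hand into a canvas twice the image size, with the original image centered at offset (W/2, H/2). That is why every crop coordinate gets W/2 or H/2 added: a square crop around a hand at the image border can then never index out of bounds, and the final depth map is simply the central H x W window. The coordinate mapping, in brief:

# padded-canvas coordinates -> original-image coordinates (illustrative)
def to_original(px, py, W, H):
    return px - W // 2, py - H // 2

# a crop around a hand at the left edge (x_c ~ 0) starts near W/2 - crop_len/2
# in canvas coordinates, which stays >= 0 whenever crop_len <= W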
    def get_keypoints(self, img, Graphormer_model, mano, mesh_sampler, scale, crop_len):
        global args
        H, W = int(crop_len), int(crop_len)
        Graphormer_model.eval()
        mano.eval()
        device = next(Graphormer_model.parameters()).device
        with torch.no_grad():
            img_tensor = self.transform(img)
            #print(img_tensor)
            batch_imgs = torch.unsqueeze(img_tensor, 0).to(device)

            # forward-pass
            pred_camera, pred_3d_joints, pred_vertices_sub, pred_vertices, hidden_states, att = Graphormer_model(batch_imgs, mano, mesh_sampler)

            # obtain 3d joints, which are regressed from the full mesh
            pred_3d_joints_from_mesh = mano.get_3d_joints(pred_vertices)
            # obtain 2d joints, which are projected from 3d joints of mesh
            #pred_2d_joints_from_mesh = orthographic_projection(pred_3d_joints_from_mesh.contiguous(), pred_camera.contiguous())
            #pred_2d_coarse_vertices_from_mesh = orthographic_projection(pred_vertices_sub.contiguous(), pred_camera.contiguous())
            pred_camera = pred_camera.cpu()
            pred_vertices = pred_vertices.cpu()
            res = crop_len
            focal_length = 1000 * scale
            camera_t = np.array([-pred_camera[1], -pred_camera[2], -2 * focal_length / (res * pred_camera[0] + 1e-9)])
            pred_3d_joints_camera = pred_3d_joints_from_mesh.cpu()[0] - camera_t
            z_3d_dist = pred_3d_joints_camera[:, 2].clone()
            pred_2d_joints_img_space = ((pred_3d_joints_camera / z_3d_dist[:, None]) * np.array((focal_length, focal_length, 1)))[:, :2] + np.array((W / 2, H / 2))

        return pred_2d_joints_img_space

    def eval_mpjpe(self, sample, info):
        H, W, C = sample.shape
        padded_image = np.zeros((H * 2, W * 2, 3))
        padded_image[int(1/2 * H):int(3/2 * H), int(1/2 * W):int(3/2 * W)] = sample
        crop_boxes = info["crop_boxes"]
        hands = info["hands"]
        groundtruth_2d_keypoints = info["groundtruth_2d_keypoints"]
        crop_lens = info["crop_lens"]
        pjpe = 0
        for i in range(len(crop_boxes)):
            crop_y_min, crop_y_max, crop_x_min, crop_x_max = crop_boxes[i]
            cropped = padded_image[crop_y_min:crop_y_max + 1, crop_x_min:crop_x_max + 1]
            hand = hands[i]
            if hand == "left":
                cropped = cv2.flip(cropped, 1)
            crop_len = crop_lens[i]
            scale = crop_len / 224
            if crop_len < 224:
                graphormer_input = cv2.resize(cropped, (224, 224), interpolation=cv2.INTER_CUBIC)
            else:
                graphormer_input = cv2.resize(cropped, (224, 224), interpolation=cv2.INTER_AREA)
            generated_keypoint = self.get_keypoints(graphormer_input.astype(np.uint8), self._model, self.mano_model, self.mesh_sampler, scale, crop_len)
            #generated_keypoint = generated_keypoint * ((crop_y_max - crop_y_min + 1)/224)
            pjpe += np.sum(np.sqrt(np.sum(((generated_keypoint - groundtruth_2d_keypoints[i]) ** 2).numpy(), axis=1)))
        mpjpe = pjpe / (len(crop_boxes) * 21)
        return mpjpe

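eval_mpjpe is the mean per-joint position error in 2-D image space: the L2 distance between each re-predicted keypoint and the keypoints recorded during get_depth, summed over all hands and divided by (number of hands * 21), the 21 joints per hand used by MediaPipe and MANO. As a formula-style sketch:

import numpy as np

def mpjpe_2d(pred, gt):
    # pred, gt: (num_hands, 21, 2) arrays of pixel coordinates
    per_joint = np.linalg.norm(pred - gt, axis=-1)  # (num_hands, 21)
    return per_joint.mean()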
hand_refiner/util.py (new file, 193 lines)
@@ -0,0 +1,193 @@
import os
import random
import warnings
from pathlib import Path

import cv2
import numpy as np
from huggingface_hub import hf_hub_download

here = Path(__file__).parent.resolve()

def HWC3(x):
    assert x.dtype == np.uint8
    if x.ndim == 2:
        x = x[:, :, None]
    assert x.ndim == 3
    H, W, C = x.shape
    assert C == 1 or C == 3 or C == 4
    if C == 3:
        return x
    if C == 1:
        return np.concatenate([x, x, x], axis=2)
    if C == 4:
        color = x[:, :, 0:3].astype(np.float32)
        alpha = x[:, :, 3:4].astype(np.float32) / 255.0
        y = color * alpha + 255.0 * (1.0 - alpha)
        y = y.clip(0, 255).astype(np.uint8)
        return y

def make_noise_disk(H, W, C, F):
    noise = np.random.uniform(low=0, high=1, size=((H // F) + 2, (W // F) + 2, C))
    noise = cv2.resize(noise, (W + 2 * F, H + 2 * F), interpolation=cv2.INTER_CUBIC)
    noise = noise[F: F + H, F: F + W]
    noise -= np.min(noise)
    noise /= np.max(noise)
    if C == 1:
        noise = noise[:, :, None]
    return noise

def nms(x, t, s):
    x = cv2.GaussianBlur(x.astype(np.float32), (0, 0), s)

    f1 = np.array([[0, 0, 0], [1, 1, 1], [0, 0, 0]], dtype=np.uint8)
    f2 = np.array([[0, 1, 0], [0, 1, 0], [0, 1, 0]], dtype=np.uint8)
    f3 = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]], dtype=np.uint8)
    f4 = np.array([[0, 0, 1], [0, 1, 0], [1, 0, 0]], dtype=np.uint8)

    y = np.zeros_like(x)

    for f in [f1, f2, f3, f4]:
        np.putmask(y, cv2.dilate(x, kernel=f) == x, x)

    z = np.zeros_like(y, dtype=np.uint8)
    z[y > t] = 255
    return z

def min_max_norm(x):
    x -= np.min(x)
    x /= np.maximum(np.max(x), 1e-5)
    return x

def safe_step(x, step=2):
    y = x.astype(np.float32) * float(step + 1)
    y = y.astype(np.int32).astype(np.float32) / float(step)
    return y

def img2mask(img, H, W, low=10, high=90):
    assert img.ndim == 3 or img.ndim == 2
    assert img.dtype == np.uint8

    if img.ndim == 3:
        y = img[:, :, random.randrange(0, img.shape[2])]
    else:
        y = img

    y = cv2.resize(y, (W, H), interpolation=cv2.INTER_CUBIC)

    if random.uniform(0, 1) < 0.5:
        y = 255 - y

    return y < np.percentile(y, random.randrange(low, high))

def safer_memory(x):
    # Fix many MAC/AMD problems
    return np.ascontiguousarray(x.copy()).copy()

UPSCALE_METHODS = ["INTER_NEAREST", "INTER_LINEAR", "INTER_AREA", "INTER_CUBIC", "INTER_LANCZOS4"]
def get_upscale_method(method_str):
    assert method_str in UPSCALE_METHODS, f"Method {method_str} not found in {UPSCALE_METHODS}"
    return getattr(cv2, method_str)

def pad64(x):
    return int(np.ceil(float(x) / 64.0) * 64 - x)

#https://github.com/Mikubill/sd-webui-controlnet/blob/main/scripts/processor.py#L17
#Added upscale_method param
def resize_image_with_pad(input_image, resolution, upscale_method="", skip_hwc3=False):
    if skip_hwc3:
        img = input_image
    else:
        img = HWC3(input_image)
    H_raw, W_raw, _ = img.shape
    k = float(resolution) / float(min(H_raw, W_raw))
    H_target = int(np.round(float(H_raw) * k))
    W_target = int(np.round(float(W_raw) * k))
    img = cv2.resize(img, (W_target, H_target), interpolation=get_upscale_method(upscale_method) if k > 1 else cv2.INTER_AREA)
    H_pad, W_pad = pad64(H_target), pad64(W_target)
    img_padded = np.pad(img, [[0, H_pad], [0, W_pad], [0, 0]], mode='edge')

    def remove_pad(x):
        return safer_memory(x[:H_target, :W_target, ...])

    return safer_memory(img_padded), remove_pad

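resize_image_with_pad scales the short side to resolution, edge-pads the bottom and right so both dimensions are multiples of 64 (the granularity typical diffusion/ControlNet pipelines expect), and returns a remove_pad closure that crops back to the pre-pad size. For example:

import numpy as np
from hand_refiner.util import resize_image_with_pad

img = np.zeros((480, 640, 3), dtype=np.uint8)
padded, remove_pad = resize_image_with_pad(img, 512, upscale_method="INTER_CUBIC")
print(padded.shape)              # (512, 704, 3): resized to 512x683, padded to multiples of 64
print(remove_pad(padded).shape)  # (512, 683, 3)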
def common_input_validate(input_image, output_type, **kwargs):
    if "img" in kwargs:
        warnings.warn("img is deprecated, please use `input_image=...` instead.", DeprecationWarning)
        input_image = kwargs.pop("img")

    if "return_pil" in kwargs:
        warnings.warn("return_pil is deprecated. Use output_type instead.", DeprecationWarning)
        output_type = "pil" if kwargs["return_pil"] else "np"

    if type(output_type) is bool:
        warnings.warn("Passing `True` or `False` to `output_type` is deprecated and will raise an error in future versions")
        if output_type:
            output_type = "pil"

    if input_image is None:
        raise ValueError("input_image must be defined.")

    if not isinstance(input_image, np.ndarray):
        input_image = np.array(input_image, dtype=np.uint8)
        output_type = output_type or "pil"
    else:
        output_type = output_type or "np"

    return (input_image, output_type)

def custom_hf_download(pretrained_model_or_path, filename, cache_dir, subfolder='', use_symlinks=False):
    local_dir = os.path.join(cache_dir, pretrained_model_or_path)
    model_path = os.path.join(local_dir, *subfolder.split('/'), filename)

    if not os.path.exists(model_path):
        print(f"Failed to find {model_path}.\n Downloading from huggingface.co")
        if use_symlinks:
            cache_dir_d = os.getenv("HUGGINGFACE_HUB_CACHE")
            if cache_dir_d is None:
                import platform
                if platform.system() == "Windows":
                    cache_dir_d = os.path.join(os.getenv("USERPROFILE"), ".cache", "huggingface", "hub")
                else:
                    cache_dir_d = os.path.join(os.getenv("HOME"), ".cache", "huggingface", "hub")
            try:
                # test whether a hard link can be created between the cache and target dirs
                if not os.path.exists(cache_dir_d):
                    os.makedirs(cache_dir_d)
                open(os.path.join(cache_dir_d, f"linktest_{filename}.txt"), "w").close()
                os.link(os.path.join(cache_dir_d, f"linktest_{filename}.txt"), os.path.join(cache_dir, f"linktest_{filename}.txt"))
                os.remove(os.path.join(cache_dir, f"linktest_{filename}.txt"))
                os.remove(os.path.join(cache_dir_d, f"linktest_{filename}.txt"))
                print("Using symlinks to download models. \n",
                      "Make sure you have enough space on your cache folder. \n",
                      "And do not purge the cache folder after downloading.\n",
                      "Otherwise, you will have to re-download the models every time you run the script.\n",
                      "You can use USE_SYMLINKS: False in config.yaml to avoid this behavior.")
            except Exception:
                print("Maybe not able to create symlink. Disable using symlinks.")
                use_symlinks = False
                cache_dir_d = os.path.join(cache_dir, pretrained_model_or_path, "cache")
        else:
            cache_dir_d = os.path.join(cache_dir, pretrained_model_or_path, "cache")

        model_path = hf_hub_download(repo_id=pretrained_model_or_path,
                                     cache_dir=cache_dir_d,
                                     local_dir=local_dir,
                                     subfolder=subfolder,
                                     filename=filename,
                                     local_dir_use_symlinks=use_symlinks,
                                     resume_download=True,
                                     etag_timeout=100)
        if not use_symlinks:
            try:
                import shutil
                shutil.rmtree(cache_dir_d)
            except Exception as e:
                print(e)
    return model_path