Skip to content

Commit 0107cf2

Browse files
committed
Merge: [ConvNets/PyT] Fixed issue with unexpected keys in state_dict
2 parents f693f4e + 5843f4e commit 0107cf2

9 files changed

Lines changed: 77 additions & 57 deletions

File tree

PyTorch/Classification/ConvNets/classify.py

Lines changed: 31 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
efficientnet_quant_b4,
3535
)
3636

37+
3738
def available_models():
3839
models = {
3940
m.name: m
@@ -51,6 +52,7 @@ def available_models():
5152
}
5253
return models
5354

55+
5456
def add_parser_arguments(parser):
5557
model_names = available_models().keys()
5658
parser.add_argument("--image-size", default="224", type=int)
@@ -98,27 +100,41 @@ def load_jpeg_from_file(path, image_size, cuda=True):
98100

99101

100102
def check_quant_weight_correctness(checkpoint_path, model):
101-
state_dict = torch.load(checkpoint_path, map_location=torch.device('cpu'))
102-
state_dict = {k[len("module."):] if k.startswith("module.") else k: v for k, v in state_dict.items()}
103-
quantizers_sd_keys = {f'{n[0]}._amax' for n in model.named_modules() if 'quantizer' in n[0]}
103+
state_dict = torch.load(checkpoint_path, map_location=torch.device("cpu"))
104+
state_dict = {
105+
k[len("module.") :] if k.startswith("module.") else k: v
106+
for k, v in state_dict.items()
107+
}
108+
quantizers_sd_keys = {
109+
f"{n[0]}._amax" for n in model.named_modules() if "quantizer" in n[0]
110+
}
104111
sd_all_keys = quantizers_sd_keys | set(model.state_dict().keys())
105-
assert set(state_dict.keys()) == sd_all_keys, (f'Passed quantized architecture, but following keys are missing in '
106-
f'checkpoint: {list(sd_all_keys - set(state_dict.keys()))}')
112+
assert set(state_dict.keys()) == sd_all_keys, (
113+
f"Passed quantized architecture, but following keys are missing in "
114+
f"checkpoint: {list(sd_all_keys - set(state_dict.keys()))}"
115+
)
107116

108117

109118
def main(args, model_args):
110119
imgnet_classes = np.array(json.load(open("./LOC_synset_mapping.json", "r")))
111-
model = available_models()[args.arch](**model_args.__dict__)
112-
if args.arch in ['efficientnet-quant-b0', 'efficientnet-quant-b4']:
120+
try:
121+
model = available_models()[args.arch](**model_args.__dict__)
122+
except RuntimeError as e:
123+
print_in_box(
124+
"Error when creating model, did you forget to run checkpoint2model script?"
125+
)
126+
raise e
127+
128+
if args.arch in ["efficientnet-quant-b0", "efficientnet-quant-b4"]:
113129
check_quant_weight_correctness(model_args.pretrained_from_file, model)
114-
130+
115131
if not args.cpu:
116132
model = model.cuda()
117133
model.eval()
118134

119135
input = load_jpeg_from_file(args.image, args.image_size, cuda=not args.cpu)
120136

121-
with torch.no_grad(), autocast(enabled = args.precision == "AMP"):
137+
with torch.no_grad(), autocast(enabled=args.precision == "AMP"):
122138
output = torch.nn.functional.softmax(model(input), dim=1)
123139

124140
output = output.float().cpu().view(-1).numpy()
@@ -129,6 +145,12 @@ def main(args, model_args):
129145
print(f"{c}: {100*v:.1f}%")
130146

131147

148+
def print_in_box(msg):
149+
print("#" * (len(msg) + 10))
150+
print(f"#### {msg} ####")
151+
print("#" * (len(msg) + 10))
152+
153+
132154
if __name__ == "__main__":
133155
parser = argparse.ArgumentParser(description="PyTorch ImageNet Classification")
134156

PyTorch/Classification/ConvNets/efficientnet/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -369,7 +369,7 @@ To use your own dataset, divide it into directories. For example:
369369
- Training images - `train/<class id>/<image>`
370370
- Validation images - `val/<class id>/<image>`
371371

372-
If your dataset has a number of classes different than 1000, you need to pass the `--num-classes N` flag to the training script.
372+
If your dataset has a number of classes different than 1000, you need to pass the `--num_classes N` flag to the training script.
373373

374374
### Training process
375375

PyTorch/Classification/ConvNets/image_classification/logger.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -409,13 +409,13 @@ def __init__(self, logger):
409409

410410

411411
class ValidationMetrics(Metrics):
412-
def __init__(self, logger, prefix):
412+
def __init__(self, logger, prefix, topk):
413413
super().__init__(logger)
414414
if self.logger is not None:
415415
self.map = {
416416
"loss": [f"{prefix}.loss"],
417417
"top1": [f"{prefix}.top1"],
418-
"top5": [f"{prefix}.top5"],
418+
f"top{topk}": [f"{prefix}.top{topk}"],
419419
"compute_ips": [f"{prefix}.compute_ips"],
420420
"total_ips": [f"{prefix}.total_ips"],
421421
"data_time": [f"{prefix}.data_time"],
@@ -433,7 +433,7 @@ def __init__(self, logger, prefix):
433433
metadata=Metrics.ACC_METADATA,
434434
)
435435
logger.register_metric(
436-
f"{prefix}.top5",
436+
f"{prefix}.top{topk}",
437437
ACC_METER(),
438438
verbosity=dllogger.Verbosity.DEFAULT,
439439
metadata=Metrics.ACC_METADATA,

PyTorch/Classification/ConvNets/image_classification/models/model.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ def torchhub_docstring(name: str):
4747
pretrained (bool, True): If True, returns a model pretrained on IMAGENET dataset.
4848
"""
4949

50+
5051
class EntryPoint:
5152
@staticmethod
5253
def create(name: str, model: Model):
@@ -119,7 +120,7 @@ def reshape(t, conv):
119120
state_dict_key_map_fn(k): v for k, v in state_dict.items()
120121
}
121122

122-
if hasattr(model, "ngc_checkpoint_remap"):
123+
if pretrained and hasattr(model, "ngc_checkpoint_remap"):
123124
remap_fn = model.ngc_checkpoint_remap(url=self.model.checkpoint_url)
124125
state_dict = {remap_fn(k): v for k, v in state_dict.items()}
125126

PyTorch/Classification/ConvNets/image_classification/training.py

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -252,7 +252,7 @@ def train(
252252
return interrupted
253253

254254

255-
def validate(infer_fn, val_loader, log_fn, prof=-1, with_loss=True):
255+
def validate(infer_fn, val_loader, log_fn, prof=-1, with_loss=True, topk=5):
256256
top1 = log.AverageMeter()
257257
# switch to evaluate mode
258258

@@ -270,23 +270,18 @@ def validate(infer_fn, val_loader, log_fn, prof=-1, with_loss=True):
270270
output = infer_fn(input)
271271

272272
with torch.no_grad():
273-
prec1, prec5 = utils.accuracy(output.data, target, topk=(1, 5))
273+
precs = utils.accuracy(output.data, target, topk=(1, topk))
274274

275275
if torch.distributed.is_initialized():
276276
if with_loss:
277277
reduced_loss = utils.reduce_tensor(loss.detach())
278-
prec1 = utils.reduce_tensor(prec1)
279-
prec5 = utils.reduce_tensor(prec5)
278+
precs = map(utils.reduce_tensor, precs)
280279
else:
281280
if with_loss:
282281
reduced_loss = loss.detach()
283282

284-
prec1 = prec1.item()
285-
prec5 = prec5.item()
286-
infer_result = {
287-
"top1": (prec1, bs),
288-
"top5": (prec5, bs),
289-
}
283+
precs = map(lambda t: t.item(), precs)
284+
infer_result = {f"top{k}": (p, bs) for k, p in zip((1, topk), precs)}
290285

291286
if with_loss:
292287
infer_result["loss"] = (reduced_loss.item(), bs)
@@ -295,7 +290,7 @@ def validate(infer_fn, val_loader, log_fn, prof=-1, with_loss=True):
295290

296291
it_time = time.time() - end
297292

298-
top1.record(prec1, bs)
293+
top1.record(infer_result["top1"][0], bs)
299294

300295
log_fn(
301296
compute_ips=utils.calc_ips(bs, it_time - data_time),
@@ -332,6 +327,7 @@ def train_loop(
332327
checkpoint_dir="./",
333328
checkpoint_filename="checkpoint.pth.tar",
334329
keep_last_n_checkpoints=0,
330+
topk=5,
335331
):
336332
checkpointer = utils.Checkpointer(
337333
last_filename=checkpoint_filename,
@@ -340,7 +336,7 @@ def train_loop(
340336
)
341337
train_metrics = TrainingMetrics(logger)
342338
val_metrics = {
343-
k: ValidationMetrics(logger, k) for k in trainer.validation_steps().keys()
339+
k: ValidationMetrics(logger, k, topk) for k in trainer.validation_steps().keys()
344340
}
345341
training_step = trainer.train_step
346342

@@ -389,6 +385,7 @@ def train_loop(
389385
data_iter,
390386
val_metrics[k].log,
391387
prof=prof,
388+
topk=topk,
392389
)
393390

394391
if k == "val":

PyTorch/Classification/ConvNets/main.py

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,9 @@
2929
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3030
import os
3131

32-
os.environ["KMP_AFFINITY"] = "disabled" # We need to do this before importing anything else as a workaround for this bug: https://github.com/pytorch/pytorch/issues/28389
32+
os.environ[
33+
"KMP_AFFINITY"
34+
] = "disabled" # We need to do this before importing anything else as a workaround for this bug: https://github.com/pytorch/pytorch/issues/28389
3335

3436
import argparse
3537
import random
@@ -290,7 +292,7 @@ def add_parser_arguments(parser, skip_arch=False):
290292
"Gather N last checkpoints throughout the training,"
291293
" without this flag only best and last checkpoints will be stored. "
292294
"Use -1 for all checkpoints"
293-
)
295+
),
294296
)
295297

296298
parser.add_argument(
@@ -343,13 +345,6 @@ def add_parser_arguments(parser, skip_arch=False):
343345
choices=[None, "autoaugment"],
344346
help="augmentation method",
345347
)
346-
parser.add_argument(
347-
"--num-classes",
348-
type=int,
349-
default=None,
350-
required=False,
351-
help="number of classes",
352-
)
353348

354349
parser.add_argument(
355350
"--gpu-affinity",
@@ -359,6 +354,13 @@ def add_parser_arguments(parser, skip_arch=False):
359354
choices=[am.name for am in AffinityMode],
360355
)
361356

357+
parser.add_argument(
358+
"--topk",
359+
type=int,
360+
default=5,
361+
required=False,
362+
)
363+
362364

363365
def prepare_for_training(args, model_args, model_arch):
364366
args.distributed = False
@@ -389,7 +391,7 @@ def prepare_for_training(args, model_args, model_arch):
389391

390392
def _worker_init_fn(id):
391393
# Worker process should inherit its affinity from parent
392-
affinity = os.sched_getaffinity(0)
394+
affinity = os.sched_getaffinity(0)
393395
print(f"Process {args.local_rank} Worker {id} set affinity to: {affinity}")
394396

395397
np.random.seed(seed=args.seed + args.local_rank + id)
@@ -645,6 +647,7 @@ def main(args, model_args, model_arch):
645647
checkpoint_dir=args.workspace,
646648
checkpoint_filename=args.checkpoint_filename,
647649
keep_last_n_checkpoints=args.gather_checkpoints,
650+
topk=args.topk,
648651
)
649652
exp_duration = time.time() - exp_start_time
650653
if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:

PyTorch/Classification/ConvNets/resnet50v1.5/README.md

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -293,7 +293,7 @@ unzip resnet50_pyt_amp_20.06.0.zip
293293

294294
To run inference on ImageNet, run:
295295

296-
`python ./main.py --arch resnet50 --evaluate --epochs 1 --pretrained-weights nvidia_resnet50_200821.pth.tar -b <batch size> <path to imagenet>`
296+
`python ./main.py --arch resnet50 --evaluate --epochs 1 --pretrained-from-file nvidia_resnet50_200821.pth.tar -b <batch size> <path to imagenet>`
297297

298298
To run inference on JPEG image using pretrained weights:
299299

@@ -331,12 +331,12 @@ To see the full list of available options and their descriptions, use the `-h` o
331331

332332
```
333333
usage: main.py [-h] [--data-backend BACKEND] [--arch ARCH]
334-
[--model-config CONF] [--num-classes N] [-j N] [--epochs N]
334+
[--model-config CONF] [-j N] [--epochs N]
335335
[--run-epochs N] [-b N] [--optimizer-batch-size N] [--lr LR]
336336
[--lr-schedule SCHEDULE] [--warmup E] [--label-smoothing S]
337337
[--mixup ALPHA] [--momentum M] [--weight-decay W]
338338
[--bn-weight-decay] [--nesterov] [--print-freq N]
339-
[--resume PATH] [--pretrained-weights PATH]
339+
[--resume PATH] [--pretrained-from-file PATH]
340340
[--static-loss-scale STATIC_LOSS_SCALE] [--dynamic-loss-scale]
341341
[--prof N] [--amp] [--seed SEED] [--gather-checkpoints]
342342
[--raport-file RAPORT_FILE] [--evaluate] [--training-only]
@@ -362,7 +362,6 @@ optional arguments:
362362
--model-config CONF, -c CONF
363363
model configs: classic | fanin | grp-fanin | grp-
364364
fanout(default: classic)
365-
--num-classes N number of classes in the dataset
366365
-j N, --workers N number of data loading workers (default: 5)
367366
--epochs N number of total epochs to run
368367
--run-epochs N run only N epochs, used for checkpointing runs
@@ -385,7 +384,7 @@ optional arguments:
385384
--nesterov use nesterov momentum, (default: false)
386385
--print-freq N, -p N print frequency (default: 10)
387386
--resume PATH path to latest checkpoint (default: none)
388-
--pretrained-weights PATH
387+
--pretrained-from-file PATH
389388
load weights from here
390389
--static-loss-scale STATIC_LOSS_SCALE
391390
Static loss scale, positive power of 2 values can
@@ -418,7 +417,7 @@ To use your own dataset, divide it in directories as in the following scheme:
418417
- Training images - `train/<class id>/<image>`
419418
- Validation images - `val/<class id>/<image>`
420419

421-
If your dataset's has number of classes different than 1000, you need to pass `--num-classes N` flag to the training script.
420+
If your dataset has a number of classes different than 1000, you need to pass the `--num_classes N` flag to the training script.
422421

423422
### Training process
424423

@@ -438,7 +437,7 @@ Metrics gathered through training:
438437

439438
To restart training from checkpoint use `--resume` option.
440439

441-
To start training from pretrained weights (e.g. downloaded from NGC) use `--pretrained-weights` option.
440+
To start training from pretrained weights (e.g. downloaded from NGC) use `--pretrained-from-file` option.
442441

443442
The difference between those two is that the pretrained weights contain only model weights,
444443
and checkpoints, apart from model weights, contain optimizer state, LR scheduler state.
@@ -476,7 +475,7 @@ Then run classification script:
476475

477476
You can also run ImageNet validation on pretrained weights:
478477

479-
`python ./main.py --arch resnet50 --evaluate --epochs 1 --pretrained-weights <path to pretrained weights> -b <batch size> <path to imagenet>`
478+
`python ./main.py --arch resnet50 --evaluate --epochs 1 --pretrained-from-file <path to pretrained weights> -b <batch size> <path to imagenet>`
480479

481480
#### NGC Pretrained weights:
482481

@@ -489,7 +488,7 @@ unzip resnet50_pyt_amp_20.06.0.zip
489488
```
490489
To run inference on ImageNet, run:
491490

492-
`python ./main.py --arch resnet50 --evaluate --epochs 1 --pretrained-weights nvidia_resnet50_200821.pth.tar -b <batch size> <path to imagenet>`
491+
`python ./main.py --arch resnet50 --evaluate --epochs 1 --pretrained-from-file nvidia_resnet50_200821.pth.tar -b <batch size> <path to imagenet>`
493492

494493
To run inference on JPEG image using pretrained weights:
495494

PyTorch/Classification/ConvNets/resnext101-32x4d/README.md

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -314,7 +314,7 @@ To see the full list of available options and their descriptions, use the `-h` o
314314

315315
```
316316
usage: main.py [-h] [--data-backend BACKEND] [--arch ARCH]
317-
[--model-config CONF] [--num-classes N] [-j N] [--epochs N]
317+
[--model-config CONF] [-j N] [--epochs N]
318318
[--run-epochs N] [-b N] [--optimizer-batch-size N] [--lr LR]
319319
[--lr-schedule SCHEDULE] [--warmup E] [--label-smoothing S]
320320
[--mixup ALPHA] [--momentum M] [--weight-decay W]
@@ -345,7 +345,6 @@ optional arguments:
345345
--model-config CONF, -c CONF
346346
model configs: classic | fanin | grp-fanin | grp-
347347
fanout(default: classic)
348-
--num-classes N number of classes in the dataset
349348
-j N, --workers N number of data loading workers (default: 5)
350349
--epochs N number of total epochs to run
351350
--run-epochs N run only N epochs, used for checkpointing runs
@@ -400,7 +399,7 @@ To use your own dataset, divide it in directories as in the following scheme:
400399
- Training images - `train/<class id>/<image>`
401400
- Validation images - `val/<class id>/<image>`
402401

403-
If your dataset's has number of classes different than 1000, you need to pass `--num-classes N` flag to the training script.
402+
If your dataset has a number of classes different than 1000, you need to pass the `--num_classes N` flag to the training script.
404403

405404
### Training process
406405

0 commit comments

Comments
 (0)