Skip to content

Commit 0107cf2

Browse files
committed
Merge: [ConvNets/PyT] Fixed issue with unexpected keys in state_dict
2 parents f693f4e + 5843f4e commit 0107cf2

9 files changed

Lines changed: 77 additions & 57 deletions

File tree

PyTorch/Classification/ConvNets/classify.py

Lines changed: 31 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
efficientnet_quant_b4,
3535
)
3636

37+
3738
def available_models():
3839
models = {
3940
m.name: m
@@ -51,6 +52,7 @@ def available_models():
5152
}
5253
return models
5354

55+
5456
def add_parser_arguments(parser):
5557
model_names = available_models().keys()
5658
parser.add_argument("--image-size", default="224", type=int)
@@ -98,27 +100,41 @@ def load_jpeg_from_file(path, image_size, cuda=True):
98100

99101

100102
def check_quant_weight_correctness(checkpoint_path, model):
101-
state_dict = torch.load(checkpoint_path, map_location=torch.device('cpu'))
102-
state_dict = {k[len("module."):] if k.startswith("module.") else k: v for k, v in state_dict.items()}
103-
quantizers_sd_keys = {f'{n[0]}._amax' for n in model.named_modules() if 'quantizer' in n[0]}
103+
state_dict = torch.load(checkpoint_path, map_location=torch.device("cpu"))
104+
state_dict = {
105+
k[len("module.") :] if k.startswith("module.") else k: v
106+
for k, v in state_dict.items()
107+
}
108+
quantizers_sd_keys = {
109+
f"{n[0]}._amax" for n in model.named_modules() if "quantizer" in n[0]
110+
}
104111
sd_all_keys = quantizers_sd_keys | set(model.state_dict().keys())
105-
assert set(state_dict.keys()) == sd_all_keys, (f'Passed quantized architecture, but following keys are missing in '
106-
f'checkpoint: {list(sd_all_keys - set(state_dict.keys()))}')
112+
assert set(state_dict.keys()) == sd_all_keys, (
113+
f"Passed quantized architecture, but following keys are missing in "
114+
f"checkpoint: {list(sd_all_keys - set(state_dict.keys()))}"
115+
)
107116

108117

109118
def main(args, model_args):
110119
imgnet_classes = np.array(json.load(open("./LOC_synset_mapping.json", "r")))
111-
model = available_models()[args.arch](**model_args.__dict__)
112-
if args.arch in ['efficientnet-quant-b0', 'efficientnet-quant-b4']:
120+
try:
121+
model = available_models()[args.arch](**model_args.__dict__)
122+
except RuntimeError as e:
123+
print_in_box(
124+
"Error when creating model, did you forget to run checkpoint2model script?"
125+
)
126+
raise e
127+
128+
if args.arch in ["efficientnet-quant-b0", "efficientnet-quant-b4"]:
113129
check_quant_weight_correctness(model_args.pretrained_from_file, model)
114-
130+
115131
if not args.cpu:
116132
model = model.cuda()
117133
model.eval()
118134

119135
input = load_jpeg_from_file(args.image, args.image_size, cuda=not args.cpu)
120136

121-
with torch.no_grad(), autocast(enabled = args.precision == "AMP"):
137+
with torch.no_grad(), autocast(enabled=args.precision == "AMP"):
122138
output = torch.nn.functional.softmax(model(input), dim=1)
123139

124140
output = output.float().cpu().view(-1).numpy()
@@ -129,6 +145,12 @@ def main(args, model_args):
129145
print(f"{c}: {100*v:.1f}%")
130146

131147

148+
def print_in_box(msg):
149+
print("#" * (len(msg) + 10))
150+
print(f"#### {msg} ####")
151+
print("#" * (len(msg) + 10))
152+
153+
132154
if __name__ == "__main__":
133155
parser = argparse.ArgumentParser(description="PyTorch ImageNet Classification")
134156

PyTorch/Classification/ConvNets/efficientnet/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -369,7 +369,7 @@ To use your own dataset, divide it into directories. For example:
369369
- Training images - `train/<class id>/<image>`
370370
- Validation images - `val/<class id>/<image>`
371371

372-
If your dataset has a number of classes different than 1000, you need to pass the `--num-classes N` flag to the training script.
372+
If your dataset has a number of classes different than 1000, you need to pass the `--num_classes N` flag to the training script.
373373

374374
### Training process
375375

PyTorch/Classification/ConvNets/image_classification/logger.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -409,13 +409,13 @@ def __init__(self, logger):
409409

410410

411411
class ValidationMetrics(Metrics):
412-
def __init__(self, logger, prefix):
412+
def __init__(self, logger, prefix, topk):
413413
super().__init__(logger)
414414
if self.logger is not None:
415415
self.map = {
416416
"loss": [f"{prefix}.loss"],
417417
"top1": [f"{prefix}.top1"],
418-
"top5": [f"{prefix}.top5"],
418+
f"top{topk}": [f"{prefix}.top{topk}"],
419419
"compute_ips": [f"{prefix}.compute_ips"],
420420
"total_ips": [f"{prefix}.total_ips"],
421421
"data_time": [f"{prefix}.data_time"],
@@ -433,7 +433,7 @@ def __init__(self, logger, prefix):
433433
metadata=Metrics.ACC_METADATA,
434434
)
435435
logger.register_metric(
436-
f"{prefix}.top5",
436+
f"{prefix}.top{topk}",
437437
ACC_METER(),
438438
verbosity=dllogger.Verbosity.DEFAULT,
439439
metadata=Metrics.ACC_METADATA,

PyTorch/Classification/ConvNets/image_classification/models/model.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ def torchhub_docstring(name: str):
4747
pretrained (bool, True): If True, returns a model pretrained on IMAGENET dataset.
4848
"""
4949

50+
5051
class EntryPoint:
5152
@staticmethod
5253
def create(name: str, model: Model):
@@ -119,7 +120,7 @@ def reshape(t, conv):
119120
state_dict_key_map_fn(k): v for k, v in state_dict.items()
120121
}
121122

122-
if hasattr(model, "ngc_checkpoint_remap"):
123+
if pretrained and hasattr(model, "ngc_checkpoint_remap"):
123124
remap_fn = model.ngc_checkpoint_remap(url=self.model.checkpoint_url)
124125
state_dict = {remap_fn(k): v for k, v in state_dict.items()}
125126

PyTorch/Classification/ConvNets/image_classification/training.py

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -252,7 +252,7 @@ def train(
252252
return interrupted
253253

254254

255-
def validate(infer_fn, val_loader, log_fn, prof=-1, with_loss=True):
255+
def validate(infer_fn, val_loader, log_fn, prof=-1, with_loss=True, topk=5):
256256
top1 = log.AverageMeter()
257257
# switch to evaluate mode
258258

@@ -270,23 +270,18 @@ def validate(infer_fn, val_loader, log_fn, prof=-1, with_loss=True):
270270
output = infer_fn(input)
271271

272272
with torch.no_grad():
273-
prec1, prec5 = utils.accuracy(output.data, target, topk=(1, 5))
273+
precs = utils.accuracy(output.data, target, topk=(1, topk))
274274

275275
if torch.distributed.is_initialized():
276276
if with_loss:
277277
reduced_loss = utils.reduce_tensor(loss.detach())
278-
prec1 = utils.reduce_tensor(prec1)
279-
prec5 = utils.reduce_tensor(prec5)
278+
precs = map(utils.reduce_tensor, precs)
280279
else:
281280
if with_loss:
282281
reduced_loss = loss.detach()
283282

284-
prec1 = prec1.item()
285-
prec5 = prec5.item()
286-
infer_result = {
287-
"top1": (prec1, bs),
288-
"top5": (prec5, bs),
289-
}
283+
precs = map(lambda t: t.item(), precs)
284+
infer_result = {f"top{k}": (p, bs) for k, p in zip((1, topk), precs)}
290285

291286
if with_loss:
292287
infer_result["loss"] = (reduced_loss.item(), bs)
@@ -295,7 +290,7 @@ def validate(infer_fn, val_loader, log_fn, prof=-1, with_loss=True):
295290

296291
it_time = time.time() - end
297292

298-
top1.record(prec1, bs)
293+
top1.record(infer_result["top1"][0], bs)
299294

300295
log_fn(
301296
compute_ips=utils.calc_ips(bs, it_time - data_time),
@@ -332,6 +327,7 @@ def train_loop(
332327
checkpoint_dir="./",
333328
checkpoint_filename="checkpoint.pth.tar",
334329
keep_last_n_checkpoints=0,
330+
topk=5,
335331
):
336332
checkpointer = utils.Checkpointer(
337333
last_filename=checkpoint_filename,
@@ -340,7 +336,7 @@ def train_loop(
340336
)
341337
train_metrics = TrainingMetrics(logger)
342338
val_metrics = {
343-
k: ValidationMetrics(logger, k) for k in trainer.validation_steps().keys()
339+
k: ValidationMetrics(logger, k, topk) for k in trainer.validation_steps().keys()
344340
}
345341
training_step = trainer.train_step
346342

@@ -389,6 +385,7 @@ def train_loop(
389385
data_iter,
390386
val_metrics[k].log,
391387
prof=prof,
388+
topk=topk,
392389
)
393390

394391
if k == "val":

PyTorch/Classification/ConvNets/main.py

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,9 @@
2929
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3030
import os
3131

32-
os.environ["KMP_AFFINITY"] = "disabled" # We need to do this before importing anything else as a workaround for this bug: https://github.com/pytorch/pytorch/issues/28389
32+
os.environ[
33+
"KMP_AFFINITY"
34+
] = "disabled" # We need to do this before importing anything else as a workaround for this bug: https://github.com/pytorch/pytorch/issues/28389
3335

3436
import argparse
3537
import random
@@ -290,7 +292,7 @@ def add_parser_arguments(parser, skip_arch=False):
290292
"Gather N last checkpoints throughout the training,"
291293
" without this flag only best and last checkpoints will be stored. "
292294
"Use -1 for all checkpoints"
293-
)
295+
),
294296
)
295297

296298
parser.add_argument(
@@ -343,13 +345,6 @@ def add_parser_arguments(parser, skip_arch=False):
343345
choices=[None, "autoaugment"],
344346
help="augmentation method",
345347
)
346-
parser.add_argument(
347-
"--num-classes",
348-
type=int,
349-
default=None,
350-
required=False,
351-
help="number of classes",
352-
)
353348

354349
parser.add_argument(
355350
"--gpu-affinity",
@@ -359,6 +354,13 @@ def add_parser_arguments(parser, skip_arch=False):
359354
choices=[am.name for am in AffinityMode],
360355
)
361356

357+
parser.add_argument(
358+
"--topk",
359+
type=int,
360+
default=5,
361+
required=False,
362+
)
363+
362364

363365
def prepare_for_training(args, model_args, model_arch):
364366
args.distributed = False
@@ -389,7 +391,7 @@ def prepare_for_training(args, model_args, model_arch):
389391

390392
def _worker_init_fn(id):
391393
# Worker process should inherit its affinity from parent
392-
affinity = os.sched_getaffinity(0)
394+
affinity = os.sched_getaffinity(0)
393395
print(f"Process {args.local_rank} Worker {id} set affinity to: {affinity}")
394396

395397
np.random.seed(seed=args.seed + args.local_rank + id)
@@ -645,6 +647,7 @@ def main(args, model_args, model_arch):
645647
checkpoint_dir=args.workspace,
646648
checkpoint_filename=args.checkpoint_filename,
647649
keep_last_n_checkpoints=args.gather_checkpoints,
650+
topk=args.topk,
648651
)
649652
exp_duration = time.time() - exp_start_time
650653
if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:

PyTorch/Classification/ConvNets/resnet50v1.5/README.md

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -293,7 +293,7 @@ unzip resnet50_pyt_amp_20.06.0.zip
293293

294294
To run inference on ImageNet, run:
295295

296-
`python ./main.py --arch resnet50 --evaluate --epochs 1 --pretrained-weights nvidia_resnet50_200821.pth.tar -b <batch size> <path to imagenet>`
296+
`python ./main.py --arch resnet50 --evaluate --epochs 1 --pretrained-from-file nvidia_resnet50_200821.pth.tar -b <batch size> <path to imagenet>`
297297

298298
To run inference on JPEG image using pretrained weights:
299299

@@ -331,12 +331,12 @@ To see the full list of available options and their descriptions, use the `-h` o
331331

332332
```
333333
usage: main.py [-h] [--data-backend BACKEND] [--arch ARCH]
334-
[--model-config CONF] [--num-classes N] [-j N] [--epochs N]
334+
[--model-config CONF] [-j N] [--epochs N]
335335
[--run-epochs N] [-b N] [--optimizer-batch-size N] [--lr LR]
336336
[--lr-schedule SCHEDULE] [--warmup E] [--label-smoothing S]
337337
[--mixup ALPHA] [--momentum M] [--weight-decay W]
338338
[--bn-weight-decay] [--nesterov] [--print-freq N]
339-
[--resume PATH] [--pretrained-weights PATH]
339+
[--resume PATH] [--pretrained-from-file PATH]
340340
[--static-loss-scale STATIC_LOSS_SCALE] [--dynamic-loss-scale]
341341
[--prof N] [--amp] [--seed SEED] [--gather-checkpoints]
342342
[--raport-file RAPORT_FILE] [--evaluate] [--training-only]
@@ -362,7 +362,6 @@ optional arguments:
362362
--model-config CONF, -c CONF
363363
model configs: classic | fanin | grp-fanin | grp-
364364
fanout(default: classic)
365-
--num-classes N number of classes in the dataset
366365
-j N, --workers N number of data loading workers (default: 5)
367366
--epochs N number of total epochs to run
368367
--run-epochs N run only N epochs, used for checkpointing runs
@@ -385,7 +384,7 @@ optional arguments:
385384
--nesterov use nesterov momentum, (default: false)
386385
--print-freq N, -p N print frequency (default: 10)
387386
--resume PATH path to latest checkpoint (default: none)
388-
--pretrained-weights PATH
387+
--pretrained-from-file PATH
389388
load weights from here
390389
--static-loss-scale STATIC_LOSS_SCALE
391390
Static loss scale, positive power of 2 values can
@@ -418,7 +417,7 @@ To use your own dataset, divide it in directories as in the following scheme:
418417
- Training images - `train/<class id>/<image>`
419418
- Validation images - `val/<class id>/<image>`
420419

421-
If your dataset's has number of classes different than 1000, you need to pass `--num-classes N` flag to the training script.
420+
If your dataset has a number of classes different than 1000, you need to pass the `--num_classes N` flag to the training script.
422421

423422
### Training process
424423

@@ -438,7 +437,7 @@ Metrics gathered through training:
438437

439438
To restart training from checkpoint use `--resume` option.
440439

441-
To start training from pretrained weights (e.g. downloaded from NGC) use `--pretrained-weights` option.
440+
To start training from pretrained weights (e.g. downloaded from NGC) use `--pretrained-from-file` option.
442441

443442
The difference between those two is that the pretrained weights contain only model weights,
444443
and checkpoints, apart from model weights, contain optimizer state, LR scheduler state.
@@ -476,7 +475,7 @@ Then run classification script:
476475

477476
You can also run ImageNet validation on pretrained weights:
478477

479-
`python ./main.py --arch resnet50 --evaluate --epochs 1 --pretrained-weights <path to pretrained weights> -b <batch size> <path to imagenet>`
478+
`python ./main.py --arch resnet50 --evaluate --epochs 1 --pretrained-from-file <path to pretrained weights> -b <batch size> <path to imagenet>`
480479

481480
#### NGC Pretrained weights:
482481

@@ -489,7 +488,7 @@ unzip resnet50_pyt_amp_20.06.0.zip
489488
```
490489
To run inference on ImageNet, run:
491490

492-
`python ./main.py --arch resnet50 --evaluate --epochs 1 --pretrained-weights nvidia_resnet50_200821.pth.tar -b <batch size> <path to imagenet>`
491+
`python ./main.py --arch resnet50 --evaluate --epochs 1 --pretrained-from-file nvidia_resnet50_200821.pth.tar -b <batch size> <path to imagenet>`
493492

494493
To run inference on JPEG image using pretrained weights:
495494

PyTorch/Classification/ConvNets/resnext101-32x4d/README.md

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -314,7 +314,7 @@ To see the full list of available options and their descriptions, use the `-h` o
314314

315315
```
316316
usage: main.py [-h] [--data-backend BACKEND] [--arch ARCH]
317-
[--model-config CONF] [--num-classes N] [-j N] [--epochs N]
317+
[--model-config CONF] [-j N] [--epochs N]
318318
[--run-epochs N] [-b N] [--optimizer-batch-size N] [--lr LR]
319319
[--lr-schedule SCHEDULE] [--warmup E] [--label-smoothing S]
320320
[--mixup ALPHA] [--momentum M] [--weight-decay W]
@@ -345,7 +345,6 @@ optional arguments:
345345
--model-config CONF, -c CONF
346346
model configs: classic | fanin | grp-fanin | grp-
347347
fanout(default: classic)
348-
--num-classes N number of classes in the dataset
349348
-j N, --workers N number of data loading workers (default: 5)
350349
--epochs N number of total epochs to run
351350
--run-epochs N run only N epochs, used for checkpointing runs
@@ -400,7 +399,7 @@ To use your own dataset, divide it in directories as in the following scheme:
400399
- Training images - `train/<class id>/<image>`
401400
- Validation images - `val/<class id>/<image>`
402401

403-
If your dataset's has number of classes different than 1000, you need to pass `--num-classes N` flag to the training script.
402+
If your dataset has a number of classes different than 1000, you need to pass the `--num_classes N` flag to the training script.
404403

405404
### Training process
406405

0 commit comments

Comments
 (0)