Environment dependencies:
PyTorch 0.4 or later
tensorboardX: pip install tensorboardX and pip install tensorflow (TensorFlow supplies the tensorboard web server that reads the event files)
Add tensorboardX logging calls to your project code; they write event files to disk, which TensorBoard then renders in the browser as visualizations.
Official example: the tensorboardX repository ships a demo script that logs scalars, images, histograms and more.
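A minimal sketch of the core scalar-logging pattern (SummaryWriter, add_scalar and close are the actual tensorboardX API; the tag names and toy values are only for illustration):

import math
from tensorboardX import SummaryWriter

writer = SummaryWriter()  # by default writes event files under ./runs/

for step in range(100):
    # each tag becomes its own chart in TensorBoard
    writer.add_scalar('data/sin', math.sin(step * 0.1), step)
    writer.add_scalar('data/cos', math.cos(step * 0.1), step)

writer.close()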

By default, a runs folder is created under the working directory; it stores the summary event files.
From the directory containing runs, run on the command line:
tensorboard --logdir runs    (the command is tensorboard, not tensorboardX)
It prints a URL; open it in a browser to visualize how loss, accuracy, learning rate and other logged values change over training.
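If the default port is already in use, the tensorboard command accepts a --port flag, e.g. tensorboard --logdir runs --port 6007.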
An example of wiring up summaries in a PyTorch training script (a DeepLabV3+ trainer adapted to PSPNet):
import argparse
import os
import numpy as np
import torch  # used below for optim, DataParallel, load/save and seeding
from tqdm import tqdm

from mypath import Path
from dataloaders import make_data_loader
from modeling.sync_batchnorm.replicate import patch_replication_callback
from modeling.deeplab import *
from modeling.psp_net import *
from utils.loss import SegmentationLosses
from utils.calculate_weights import calculate_weigths_labels
from utils.lr_scheduler import LR_Scheduler
from utils.saver import Saver
from utils.summaries import TensorboardSummary
from utils.metrics import Evaluator
from utils.misc import CrossEntropyLoss2d

class Trainer(object):
    def __init__(self, args):
        self.args = args

        # Define Saver
        self.saver = Saver(args)
        self.saver.save_experiment_config()
        # Define TensorBoard summary (a thin project wrapper around tensorboardX)
        self.summary = TensorboardSummary(self.saver.experiment_dir)
        self.writer = self.summary.create_summary()

        # Define dataloader; adapt this loader to your dataset
        kwargs = {'num_workers': args.workers, 'pin_memory': True}
        self.train_loader, self.val_loader, self.test_loader, self.nclass = make_data_loader(args, **kwargs)

        # Define network; the number of classes is what needs changing here
        model = PSPNet(num_classes=self.nclass).cuda()
        # Original DeepLabV3+ model:
        # model = DeepLab(num_classes=self.nclass,
        #                 backbone=args.backbone,
        #                 output_stride=args.out_stride,
        #                 sync_bn=args.sync_bn,
        #                 freeze_bn=args.freeze_bn)

        # train_params = [{'params': model.get_1x_lr_params(), 'lr': args.lr},
        #                 {'params': model.get_10x_lr_params(), 'lr': args.lr * 10}]

        # Define optimizer (DeepLabV3+):
        # optimizer = torch.optim.SGD(train_params, momentum=args.momentum,
        #                             weight_decay=args.weight_decay, nesterov=args.nesterov)
        # PSPNet optimizer: biases get twice the base lr and no weight decay;
        # note that the learning rate must come from args.lr
        optimizer = torch.optim.SGD([
            {'params': [param for name, param in model.named_parameters() if name[-4:] == 'bias'],
             'lr': 2 * args.lr},
            {'params': [param for name, param in model.named_parameters() if name[-4:] != 'bias'],
             'lr': args.lr, 'weight_decay': args.weight_decay}
        ], momentum=args.momentum, nesterov=True)

        # Define criterion; SegmentationLosses (utils/loss.py) builds the actual
        # loss function, used later as self.criterion
        # whether to use class balanced weights
        if args.use_balanced_weights:
            classes_weights_path = os.path.join(Path.db_root_dir(args.dataset), args.dataset + '_classes_weights.npy')
            if os.path.isfile(classes_weights_path):
                weight = np.load(classes_weights_path)
            else:
                weight = calculate_weigths_labels(args.dataset, self.train_loader, self.nclass)
            weight = torch.from_numpy(weight.astype(np.float32))
        else:
            weight = None
        self.criterion = SegmentationLosses(weight=weight, cuda=args.cuda).build_loss(mode=args.loss_type)
        self.model, self.optimizer = model, optimizer

        # Define evaluator
        self.evaluator = Evaluator(self.nclass)
        # Define lr scheduler
        self.scheduler = LR_Scheduler(args.lr_scheduler, args.lr,
                                      args.epochs, len(self.train_loader))

        # Using cuda
        if args.cuda:
            self.model = torch.nn.DataParallel(self.model, device_ids=self.args.gpu_ids)
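            # patch_replication_callback lets the SynchronizedBatchNorm layers
            # share batch statistics across GPU replicas under DataParallel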
            patch_replication_callback(self.model)
            self.model = self.model.cuda()

        # Resuming checkpoint
        self.best_pred = 0.0
        if args.resume is not None:
            if not os.path.isfile(args.resume):
                raise RuntimeError("=> no checkpoint found at '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            if args.cuda:
                self.model.module.load_state_dict(checkpoint['state_dict'])
            else:
                self.model.load_state_dict(checkpoint['state_dict'])
            if not args.ft:
                self.optimizer.load_state_dict(checkpoint['optimizer'])
            self.best_pred = checkpoint['best_pred']
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))

        # Clear start epoch if fine-tuning
        if args.ft:
            args.start_epoch = 0
    # Train for one epoch
    def training(self, epoch):
        train_loss = 0.0
        self.model.train()
        tbar = tqdm(self.train_loader)
        num_img_tr = len(self.train_loader)
        # Original DeepLabV3+ training step; the loss must be adapted when
        # switching to PSPNet:
        # for inputs_slice, gts_slice in zip(inputs, gts):
        #     inputs_slice = Variable(inputs_slice).cuda()
        #     gts_slice = Variable(gts_slice).cuda()
        #
        #     optimizer.zero_grad()
        #     outputs, aux = net(inputs_slice)
        #     assert outputs.size()[2:] == gts_slice.size()[1:]
        #     assert outputs.size()[1] == voc.num_classes
        #
        #     main_loss = criterion(outputs, gts_slice)
        #     aux_loss = criterion(aux, gts_slice)
        #     loss = main_loss + 0.4 * aux_loss
        #     loss.backward()
        #     optimizer.step()
        #
        #     train_main_loss.update(main_loss.item(), slice_batch_pixel_size)
        #     train_aux_loss.update(aux_loss.item(), slice_batch_pixel_size)
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
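            # LR_Scheduler objects are callable: each call adjusts the optimizer's
            # learning rate in place, once per iteration, following the
            # --lr-scheduler policy (poly/step/cos)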
            self.scheduler(self.optimizer, i, epoch, self.best_pred)

            self.optimizer.zero_grad()
            outputs, aux = self.model(image)  # main and auxiliary predictions
            assert outputs.size()[2:] == target.size()[1:]
            assert outputs.size()[1] == self.nclass
            loss = self.criterion(outputs, target)
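            # Note: the auxiliary logits from the forward pass are not used in the
            # loss here; the commented-out reference above optimizes
            # main_loss + 0.4 * aux_loss instead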
            loss.backward()

            # DeepLabV3+ version:
            # self.optimizer.zero_grad()
            # output = self.model(image)
            # loss = self.criterion(output, target)
            # loss.backward()
            self.optimizer.step()
            train_loss += loss.item()
            tbar.set_description('Train loss: %.3f' % (train_loss / (i + 1)))
            self.writer.add_scalar('train/total_loss_iter', loss.item(), i + num_img_tr * epoch)

            # Show 10 * 3 inference results each epoch
            # (guard against loaders with fewer than 10 batches)
            if num_img_tr >= 10 and i % (num_img_tr // 10) == 0:
                global_step = i + num_img_tr * epoch
                self.summary.visualize_image(self.writer, self.args.dataset, image, target, outputs, global_step)

        self.writer.add_scalar('train/total_loss_epoch', train_loss, epoch)
        print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image.data.shape[0]))
        print('Loss: %.3f' % train_loss)

        if self.args.no_val:
            # save checkpoint every epoch
            is_best = False
            self.saver.save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': self.model.module.state_dict(),
                'optimizer': self.optimizer.state_dict(),
                'best_pred': self.best_pred,
            }, is_best)

    def validation(self, epoch):
        self.model.eval()
        self.evaluator.reset()
        tbar = tqdm(self.val_loader, desc='\r')
        test_loss = 0.0
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            with torch.no_grad():
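                # assumption: in eval mode this PSPNet implementation returns only
                # the main logits (the auxiliary head is active only in training)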
                output = self.model(image)
                loss = self.criterion(output, target)
            test_loss += loss.item()
            tbar.set_description('Test loss: %.3f' % (test_loss / (i + 1)))
            pred = output.data.cpu().numpy()
            target = target.cpu().numpy()
            pred = np.argmax(pred, axis=1)
            # Add batch sample into evaluator
            self.evaluator.add_batch(target, pred)

        # Fast test during the training
        Acc = self.evaluator.Pixel_Accuracy()
        Acc_class = self.evaluator.Pixel_Accuracy_Class()
        mIoU = self.evaluator.Mean_Intersection_over_Union()
        FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union()
        self.writer.add_scalar('val/total_loss_epoch', test_loss, epoch)
        self.writer.add_scalar('val/mIoU', mIoU, epoch)
        self.writer.add_scalar('val/Acc', Acc, epoch)
        self.writer.add_scalar('val/Acc_class', Acc_class, epoch)
        self.writer.add_scalar('val/fwIoU', FWIoU, epoch)
        print('Validation:')
        print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image.data.shape[0]))
        print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format(Acc, Acc_class, mIoU, FWIoU))
        print('Loss: %.3f' % test_loss)

        new_pred = mIoU
        if new_pred > self.best_pred:
            is_best = True
            self.best_pred = new_pred
            self.saver.save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': self.model.module.state_dict(),
                'optimizer': self.optimizer.state_dict(),
                'best_pred': self.best_pred,
            }, is_best)

def main():
    # Hyperparameter settings
    parser = argparse.ArgumentParser(description="PyTorch DeeplabV3Plus Training")
    # Backbone (feature-extraction network) settings
    parser.add_argument('--backbone', type=str, default='resnet',
                        choices=['resnet', 'xception', 'drn', 'mobilenet'],
                        help='backbone name (default: resnet)')
    parser.add_argument('--out-stride', type=int, default=16,
                        help='network output stride (default: 16)')
    parser.add_argument('--dataset', type=str, default='pascal',
                        choices=['pascal', 'coco', 'cityscapes'],
                        help='dataset name (default: pascal)')
    parser.add_argument('--use-sbd', action='store_true', default=False,
                        help='whether to use SBD dataset (default: False)')
    parser.add_argument('--workers', type=int, default=4,
                        metavar='N', help='dataloader threads')
    parser.add_argument('--base-size', type=int, default=513,
                        help='base image size')
    # lower this if CUDA runs out of memory; the original value was 513
    parser.add_argument('--crop-size', type=int, default=256,
                        help='crop image size')
    parser.add_argument('--sync-bn', type=bool, default=None,
                        help='whether to use sync bn (default: auto)')
    parser.add_argument('--freeze-bn', type=bool, default=False,
                        help='whether to freeze bn parameters (default: False)')
    parser.add_argument('--loss-type', type=str, default='ce',
                        choices=['ce', 'focal'],
                        help='loss func type (default: ce)')
    # training hyper params
    parser.add_argument('--epochs', type=int, default=None, metavar='N',
                        help='number of epochs to train (default: auto)')
    parser.add_argument('--start_epoch', type=int, default=0,
                        metavar='N', help='start epochs (default: 0)')
    parser.add_argument('--batch-size', type=int, default=None,
                        metavar='N', help='input batch size for training (default: auto)')
    parser.add_argument('--test-batch-size', type=int, default=None,
                        metavar='N', help='input batch size for testing (default: auto)')
    parser.add_argument('--use-balanced-weights', action='store_true', default=False,
                        help='whether to use balanced weights (default: False)')
    # optimizer params
    parser.add_argument('--lr', type=float, default=None, metavar='LR',
                        help='learning rate (default: auto)')
    parser.add_argument('--lr-scheduler', type=str, default='poly',
                        choices=['poly', 'step', 'cos'],
                        help='lr scheduler mode (default: poly)')
    parser.add_argument('--momentum', type=float, default=0.9,
                        metavar='M', help='momentum (default: 0.9)')
    parser.add_argument('--weight-decay', type=float, default=5e-4,
                        metavar='M', help='w-decay (default: 5e-4)')
    parser.add_argument('--nesterov', action='store_true', default=False,
                        help='whether use nesterov (default: False)')
    # cuda, seed and logging
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--gpu-ids', type=str, default='0',
                        help='use which gpu to train, must be a comma-separated list of integers only (default=0)')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    # checkpointing
    parser.add_argument('--resume', type=str, default=None,
                        help='put the path to resuming file if needed')
    parser.add_argument('--checkname', type=str, default=None,
                        help='set the checkpoint name')
    # finetuning pre-trained models
    parser.add_argument('--ft', action='store_true', default=False,
                        help='finetuning on a different dataset')
    # evaluation option
    parser.add_argument('--eval-interval', type=int, default=1,
                        help='evaluation interval (default: 1)')
    parser.add_argument('--no-val', action='store_true', default=False,
                        help='skip validation during training')

    args = parser.parse_args()
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    if args.cuda:
        try:
            args.gpu_ids = [int(s) for s in args.gpu_ids.split(',')]
        except ValueError:
            raise ValueError('Argument --gpu_ids must be a comma-separated list of integers only')

    if args.sync_bn is None:
        if args.cuda and len(args.gpu_ids) > 1:
            args.sync_bn = True
        else:
            args.sync_bn = False

    # default epochs, batch_size and lr
    if args.epochs is None:
        default_epochs = {
            'coco': 30,
            'cityscapes': 200,
            'pascal': 50,
        }
        args.epochs = default_epochs[args.dataset.lower()]

    if args.batch_size is None:
        args.batch_size = 2 * len(args.gpu_ids)

    if args.test_batch_size is None:
        args.test_batch_size = args.batch_size

    if args.lr is None:
        lrs = {
            'coco': 0.1,
            'cityscapes': 0.01,
            'pascal': 0.007,
        }
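        # linear lr scaling with the effective batch size: with the default
        # batch size (2 images per GPU) this reduces to the table value above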
        args.lr = lrs[args.dataset.lower()] / (2 * len(args.gpu_ids)) * args.batch_size

    if args.checkname is None:
        args.checkname = 'deeplab-' + str(args.backbone)
    print(args)
    torch.manual_seed(args.seed)
    trainer = Trainer(args)
    print('Starting Epoch:', trainer.args.start_epoch)
    print('Total Epochs:', trainer.args.epochs)
    for epoch in range(trainer.args.start_epoch, trainer.args.epochs):
        trainer.training(epoch)
        if not trainer.args.no_val and epoch % args.eval_interval == (args.eval_interval - 1):
            trainer.validation(epoch)

    trainer.writer.close()

if __name__ == "__main__":
    main()
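The TensorboardSummary helper imported above is project-specific, not part of tensorboardX itself. A minimal sketch of what this script needs from it, assuming create_summary simply returns a tensorboardX SummaryWriter (the visualize_image body is project-specific and only stubbed here):

import os
from tensorboardX import SummaryWriter

class TensorboardSummary(object):
    def __init__(self, directory):
        # all event files for one experiment live under a single directory
        self.directory = directory

    def create_summary(self):
        # the trainer uses the returned writer directly, e.g. writer.add_scalar(...)
        return SummaryWriter(self.directory)

    def visualize_image(self, writer, dataset, image, target, output, global_step):
        # project-specific: decode predictions and labels into color maps and
        # log them via writer.add_image(...); stubbed out in this sketch
        pass

Assuming the script is saved as train.py and the dataset paths are configured in mypath.py, a typical run is: python train.py --dataset pascal --backbone resnet --epochs 50 --batch-size 4 --gpu-ids 0, after which tensorboard --logdir can be pointed at the experiment directory created by Saver.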
Source: https://www.cnblogs.com/ywheunji/p/10712620.html