The PyTorch implementation of PointNet is from Xu Yan's GitHub
|
|
https://oboiko.medium.com/distributed-training-with-pytorch-d1fa5f57b40
https://github.com/laisimiao/classification-cifar10-pytorch/blob/master/main_ddp.py
(1) initialize and local_rank (2) DistributedSampler (3)-1,2 DistributedSampler (4) DDP
(5) output: torch.distributed.all_reduce torch.distributed.all_gather
import torch
import torch.nn as nn
import torch.nn.functional as F
#import torchvision
from torch.utils.data import Dataset, DataLoader, random_split
import torchvision.transforms as transforms
from torch.optim import lr_scheduler
import numpy as np
import math
import os, time
import psutil #memory
import h5py
import pandas as pd
import torch.utils.data as data
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
#from sklearn.preprocessing import MinMaxScaler
#torch-summary package to show model summary
from torchsummary import summary
os.environ[ 'MPLCONFIGDIR' ] = '/hpcfs/juno/junogpu/liuzhen/tmp/'
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.ticker import StrMethodFormatter
/hpcfs/juno/junogpu/liuzhen/software/envs/cnn/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm
# PointNet++ model from https://github.com/yanx27/Pointnet_Pointnet2_pytorch
from point2 import get_model
# Probe the CUDA runtime, drop any cached allocator memory, then pick the
# compute device: the GPU when one is usable, the CPU otherwise.
torch.cuda.is_available()
torch.cuda.empty_cache()
torch.cuda.device_count()
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)
cuda
# ---- input files: CSV index of events and the HDF5 file holding them ----
csvfile = 'input/example_JUNO_Numu_CC.csv'
h5file = 'input/example_JUNO_Numu_CC.h5'
# tag used as prefix of the saved checkpoint file name
network = 'PointNet2'

# ---- output directory for the saved model parameters ----
save_model_path = './results'

# ---- network dimensions ----
# input channels: (x, y, z) + N * features
Nfeatures = 2  # 6
# output --> (dx, dy, dz)
Nlabels = 3

# ---- data loading / logging ----
n_workers = 4
log_interval = 100

# ---- optimisation hyper-parameters ----
NUM_EPOCHS = 15  # 30
BATCH_SIZE = 16
LEARNING_RATE_Max = 1e-3
DIV_Factor = 1e4
GRAD_CLIP = 0.3
WEIGHT_DECAY = 1e-5
80% for training, 20% for testing
#https://github.com/Zepeng/LS_ML.git
class H5Dataset(data.Dataset):
    """Map-style dataset that reads one event per item from an HDF5 file.

    A header-less CSV index lists, per event, the HDF5 group name (column 0)
    and the dataset name inside that group (column 1).  Each HDF5 dataset is
    read as a 2-D array of points: the first three columns are used as
    (x, y, z) and the following columns as per-point features.  The
    ``direction`` attribute of the dataset holds (theta, phi), which is
    converted to a (dx, dy, dz) label.

    Args:
        h5_path: Path of the HDF5 file holding the events.
        csv_path: Path of the CSV index file.
        n_channels: Number of feature columns (after x, y, z) to keep.
        use_transform: Any value other than ``None`` enables normalisation
            of the coordinates and of every kept feature.
    """

    # Divisor applied to the (x, y, z) rows when normalisation is enabled.
    # NOTE(review): presumably a detector length scale in the units of the
    # stored coordinates -- confirm against the data producer.
    XYZ_SCALE = 19435

    def __init__(self, h5_path, csv_path, n_channels=2, use_transform=None):
        self.to_tensor = transforms.ToTensor()
        csv_info = pd.read_csv(csv_path, header=None)
        self.groupname = np.asarray(csv_info.iloc[:, 0])
        self.datainfo = np.asarray(csv_info.iloc[:, 1])
        # The HDF5 file is opened lazily in __getitem__, see
        # https://discuss.pytorch.org/t/dataloader-when-num-worker-0-there-is-bug/25643/16
        self.file_path = h5_path
        self.h5file = None
        self.n_channels = n_channels
        self.use_transform = use_transform

    def __len__(self):
        """Return the number of events listed in the CSV index."""
        return len(self.datainfo)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for event ``idx``.

        ``image`` is a float32 tensor with one row per channel
        (x, y, z, then the first ``n_channels`` features) and one column per
        point.  ``label`` is the float32 direction vector (dx, dy, dz).
        """
        # Open on first access so that each DataLoader worker ends up with
        # its own file handle.
        if self.h5file is None:
            self.h5file = h5py.File(self.file_path, 'r')
        dset_entry = self.h5file[self.groupname[idx]][str(self.datainfo[idx])]
        direction = dset_entry.attrs[u'direction']

        # Keep (x, y, z) plus the first n_channels feature columns.  The
        # previous code only did this selection for n_channels == 2 and
        # returned every stored column otherwise.
        n_rows = 3 + self.n_channels
        pmtinfo = np.array(dset_entry)[:, :n_rows]
        image = torch.from_numpy(np.transpose(pmtinfo)).type(torch.float32)
        image = image.contiguous()

        if self.use_transform is not None:
            # Coordinates: fixed rescaling.
            image[:3] /= self.XYZ_SCALE
            # Features: per-event standardisation, one feature at a time.
            for fea in range(3, n_rows):
                column = image[fea].numpy().reshape(-1, 1)
                scaled = StandardScaler().fit_transform(column)
                image[fea] = torch.from_numpy(scaled).type(torch.float32).squeeze(1)

        # convert (theta, phi) to (dx, dy, dz)
        theta = np.array(direction[0])
        phi = np.array(direction[1])
        sin_theta = np.sin(theta)
        label = np.column_stack((sin_theta * np.cos(phi),
                                 sin_theta * np.sin(phi),
                                 np.cos(theta)))
        label = torch.from_numpy(label).type(torch.float32)
        label = torch.squeeze(label)
        return image, label
# Build the dataset over every indexed event (any non-None use_transform
# value switches normalisation on), then report its size and the resident
# memory of this process.
full_data = H5Dataset(h5_path=h5file, csv_path=csvfile,
                      n_channels=Nfeatures, use_transform="transform")
dataset_size = len(full_data)
print(f"Dataset size = {dataset_size:d}" )
rss_gb = psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024 / 1024
print(f'\nmemory usage: {rss_gb:.4f} GB')
Dataset size = 4653 memory usage: 0.9944 GB
# Random 80/20 split of the events into a training and a test subset.
train_size = int(len(full_data)*0.8)
test_size = len(full_data) - train_size
train_dataset, test_dataset = random_split(full_data, [train_size, test_size])

# Training batches are reshuffled every epoch: random_split permutes the
# indices only once, so with shuffle=False each epoch would replay exactly
# the same batches in the same order.  The test loader keeps a fixed order.
# drop_last=True discards the final incomplete batch of each loader.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                          num_workers=n_workers, shuffle=True,
                          drop_last=True, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                         num_workers=n_workers, shuffle=False,
                         drop_last=True, pin_memory=True)
print("Size of the train_loader:", len(train_loader))
print("Size of the val_loader:", len(test_loader))
Size of the train_loader: 232 Size of the val_loader: 58
# Recap the dataset / loader sizes (loader lengths are counted in batches).
print(f"Dataset_size = {dataset_size:d}" )
print(f"Batch_size = {BATCH_SIZE:d}" )
print("Length of the train_loader:", len(train_loader))
print("Length of the val_loader:", len(test_loader))
#print("data shape: ",full_data[0][0].shape )

# Fetch a single test batch to show the tensor shapes fed to the network.
first_batch = next(iter(test_loader))
images, labels = first_batch
print("\n images.shape: ",images.shape)
print(" labels.shape: ",labels.shape)
#print(labels)
#print(labels)
Dataset_size = 4653 Batch_size = 16 Length of the train_loader: 232 Length of the val_loader: 58 images.shape: torch.Size([16, 5, 17612]) labels.shape: torch.Size([16, 3])
# Instantiate the network; it is wrapped in DataParallel only when more than
# one GPU is visible.
model = get_model(Nlabels, Nfeatures)
if torch.cuda.device_count() <= 1:
    print('use single GPU')
else:
    print("Use", torch.cuda.device_count(), "GPUs")
    model = nn.DataParallel(model)
model.to(device);
## better to use: DistributedDataParallel
## run with: python -m torch.distributed.launch xxx.py
use single GPU
# Print the layer-by-layer parameter summary of the model (torchsummary).
# NOTE(review): the trailing semicolon presumably only silences the
# notebook's echo of the returned value.
summary(model); #torch
================================================================= Layer (type:depth-idx) Param # ================================================================= ├─PointNetSetAbstractionMsg: 1-1 -- | └─ModuleList: 2-1 -- | | └─ModuleList: 3-1 3,360 | | └─ModuleList: 3-2 12,864 | | └─ModuleList: 3-3 19,040 | └─ModuleList: 2-2 -- | | └─ModuleList: 3-4 256 | | └─ModuleList: 3-5 512 | | └─ModuleList: 3-6 576 ├─PointNetSetAbstractionMsg: 1-2 -- | └─ModuleList: 2-3 -- | | └─ModuleList: 3-7 33,216 | | └─ModuleList: 3-8 91,008 | | └─ModuleList: 3-9 91,008 | └─ModuleList: 2-4 -- | | └─ModuleList: 3-10 512 | | └─ModuleList: 3-11 1,024 | | └─ModuleList: 3-12 1,024 ├─PointNetSetAbstraction: 1-3 -- | └─ModuleList: 2-5 -- | | └─Conv2d: 3-13 164,864 | | └─Conv2d: 3-14 131,584 | | └─Conv2d: 3-15 525,312 | └─ModuleList: 2-6 -- | | └─BatchNorm2d: 3-16 512 | | └─BatchNorm2d: 3-17 1,024 | | └─BatchNorm2d: 3-18 2,048 ├─Linear: 1-4 524,800 ├─BatchNorm1d: 1-5 1,024 ├─Dropout: 1-6 -- ├─Linear: 1-7 131,328 ├─BatchNorm1d: 1-8 512 ├─Dropout: 1-9 -- ├─Sequential: 1-10 -- | └─Linear: 2-7 771 | └─Tanh: 2-8 -- ================================================================= Total params: 1,738,179 Trainable params: 1,738,179 Non-trainable params: 0 =================================================================
# Regression loss applied to each direction component.
criterion = nn.SmoothL1Loss(beta=1)

# 22.04.14 https://efficientdl.com/faster-deep-learning-in-pytorch-a-guide/
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=LEARNING_RATE_Max,
    weight_decay=WEIGHT_DECAY,
)

# one-cycle learning rate scheduler, stepped once per batch
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=LEARNING_RATE_Max,
    final_div_factor=DIV_Factor,
    epochs=NUM_EPOCHS,
    steps_per_epoch=len(train_loader),
)

# 22.04.14 https://efficientdl.com/faster-deep-learning-in-pytorch-a-guide/
# Automatic Mixed Precision (AMP): loss scaler handed to train()
scaler = torch.cuda.amp.GradScaler()
def train(log_interval, model, device, train_loader, optimizer, criterion, epoch, NUM_EPOCHS, lr_sched, scaler, grad_clip=None):
    """Train ``model`` for one epoch and collect per-sample results.

    Args:
        log_interval: Print a progress line every ``log_interval`` batches.
        model: Network to train; called as ``model(images)`` and expected to
            return a tensor whose columns 0..2 are compared to the labels.
        device: Device the batches are moved to.
        train_loader: Iterable of ``(images, labels)`` batches with ``len()``.
        optimizer: Optimizer stepped through ``scaler``.
        criterion: Loss applied separately to each of the three columns.
        epoch: Zero-based epoch index (used in the progress line only).
        NUM_EPOCHS: Total number of epochs (used in the progress line only).
        lr_sched: Learning-rate scheduler, stepped once per batch.
        scaler: Gradient scaler providing ``scale``, ``unscale_``, ``step``
            and ``update`` (e.g. ``torch.cuda.amp.GradScaler``).
        grad_clip: If truthy, gradients are clipped element-wise to
            ``[-grad_clip, grad_clip]`` before the optimizer step.

    Returns:
        Tuple ``(epoch_loss, lrs, v_tru0, v_pre0, v_tru1, v_pre1, v_tru2,
        v_pre2)``: the mean batch loss (``0`` if the loader is empty), the
        learning rate used for every batch, and per-sample true/predicted
        values of each of the three components as lists of NumPy scalars.
    """
    # set model as training mode
    model.train()
    n_total_steps = len(train_loader)
    losses = []
    lrs = []
    v_tru = ([], [], [])
    v_pre = ([], [], [])

    for i, (images, labels) in enumerate(train_loader):
        images = images.to(device)
        labels = labels.to(device)

        # Forward pass: one loss term per direction component.
        outputs = model(images)
        loss_x = criterion(outputs[:, 0], labels[:, 0])
        loss_y = criterion(outputs[:, 1], labels[:, 1])
        loss_z = criterion(outputs[:, 2], labels[:, 2])
        loss = loss_x + loss_y + loss_z

        # Backward and optimize
        optimizer.zero_grad()
        scaler.scale(loss).backward()

        # Gradient clipping.  The gradients are still multiplied by the loss
        # scale at this point, so they must be unscaled first; otherwise the
        # clip threshold is compared against scaled values.
        if grad_clip:
            scaler.unscale_(optimizer)
            nn.utils.clip_grad_value_(model.parameters(), grad_clip)

        scaler.step(optimizer)
        # Updates the scale for next iteration
        scaler.update()

        losses.append(loss.item())

        # Record the learning rate that was used for this batch, then
        # advance the schedule.
        lrs.append(optimizer.param_groups[0]["lr"])
        lr_sched.step()

        # One device-to-host copy per batch instead of one per element.
        outputs_np = outputs.detach().cpu().numpy()
        labels_np = labels.detach().cpu().numpy()
        for k in range(3):
            v_pre[k].extend(outputs_np[:, k])
            v_tru[k].extend(labels_np[:, k])

        if (i+1) % log_interval == 0:
            print (f'Train, Epoch [{epoch+1}/{NUM_EPOCHS}], Step [{i+1}/{n_total_steps}], LR={lrs[-1]:.2E}, Loss: {loss.item():.5f}')

    epoch_loss = float(np.mean(losses)) if losses else 0
    return epoch_loss, lrs, v_tru[0], v_pre[0], v_tru[1], v_pre[1], v_tru[2], v_pre[2]
def validation(model, device, optimizer, criterion, test_loader):
    """Evaluate ``model`` on ``test_loader`` without computing gradients.

    Args:
        model: Network to evaluate; called as ``model(images)``.
        device: Device the batches are moved to.
        optimizer: Unused; kept so existing callers keep working.
        criterion: Loss applied separately to each of the three columns.
        test_loader: Iterable of ``(images, labels)`` batches.

    Returns:
        Tuple ``(epoch_loss, v_tru0, v_pre0, v_tru1, v_pre1, v_tru2,
        v_pre2)``: the mean batch loss (``0`` if the loader is empty) and the
        per-sample true/predicted values of each of the three components as
        lists of NumPy scalars.
    """
    # set model as testing mode
    model.eval()
    test_loss = []
    v_tru = ([], [], [])
    v_pre = ([], [], [])

    with torch.no_grad():
        for images, labels in test_loader:
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            loss_x = criterion(outputs[:, 0], labels[:, 0])
            loss_y = criterion(outputs[:, 1], labels[:, 1])
            loss_z = criterion(outputs[:, 2], labels[:, 2])
            loss = loss_x + loss_y + loss_z
            test_loss.append(loss.item())

            # One device-to-host copy per batch instead of one per element.
            outputs_np = outputs.detach().cpu().numpy()
            labels_np = labels.detach().cpu().numpy()
            for k in range(3):
                v_pre[k].extend(outputs_np[:, k])
                v_tru[k].extend(labels_np[:, k])

    # The mean is computed once, after the last batch.
    epoch_loss = float(np.mean(test_loss)) if test_loss else 0
    print (f'Test, Loss: {epoch_loss:.5f}')
    return epoch_loss, v_tru[0], v_pre[0], v_tru[1], v_pre[1], v_tru[2], v_pre[2]
start = time.time()
print (f'memory usage: {psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024 / 1024 :.4f} GB' )

# The checkpoint directory must exist before the first torch.save call.
os.makedirs(save_model_path, exist_ok=True)

# record training process
epoch_train_losses = []
epoch_test_losses = []
epoch_test_tru0 = []
epoch_test_pre0 = []
epoch_test_tru1 = []
epoch_test_pre1 = []
epoch_test_tru2 = []
epoch_test_pre2 = []
epoch_best = -1
# Start from +inf so the first epoch with a finite test loss is always kept,
# whatever the scale of the loss.
epoch_best_loss = float('inf')
epoch_Learning_Rates = []

# start training
for epoch in range(NUM_EPOCHS):
    (train_losses, Learning_rate, train_tru0, train_pre0, train_tru1,
     train_pr1, train_tru2, train_pre2) = train(
        log_interval, model, device, train_loader, optimizer, criterion,
        epoch, NUM_EPOCHS, scheduler, scaler, grad_clip=GRAD_CLIP)
    (test_losses, test_tru0, test_pre0, test_tru1, test_pre1,
     test_tru2, test_pre2) = validation(model, device, optimizer, criterion, test_loader)

    # save the best epoch
    if test_losses < epoch_best_loss:
        epoch_best_loss = test_losses
        epoch_best = epoch
        # save model: when the network is wrapped in DataParallel, store the
        # wrapped module's parameters so the keys carry no "module." prefix
        # and load into a plain (single-GPU) model.
        net_to_save = model.module if isinstance(model, nn.DataParallel) else model
        torch.save(net_to_save.state_dict(), os.path.join(save_model_path, network+"_best-model-parameters.pt"))

    # save train/test results
    epoch_Learning_Rates.append(Learning_rate)
    epoch_train_losses.append(train_losses)
    epoch_test_losses.append(test_losses)
    epoch_test_tru0.append(test_tru0)
    epoch_test_pre0.append(test_pre0)
    epoch_test_tru1.append(test_tru1)
    epoch_test_pre1.append(test_pre1)
    epoch_test_tru2.append(test_tru2)
    epoch_test_pre2.append(test_pre2)

end = time.time()
print(f'\nRunning time: {(end-start)/60:.2f} min')
print(f'epoch_best_loss = {epoch_best_loss:.6f}, epoch_best = {epoch_best:.0f}, lr_best = {epoch_Learning_Rates[epoch_best][-1]:.2E}')
print("Done!")
memory usage: 2.7624 GB Train, Epoch [1/15], Step [100/232], LR=6.12E-05, Loss: 0.59333 Train, Epoch [1/15], Step [200/232], LR=1.24E-04, Loss: 0.53423 Test, Loss: 0.53817 Train, Epoch [2/15], Step [100/232], LR=2.59E-04, Loss: 0.45378 Train, Epoch [2/15], Step [200/232], LR=3.91E-04, Loss: 0.39246 Test, Loss: 0.43332 Train, Epoch [3/15], Step [100/232], LR=5.80E-04, Loss: 0.39894 Train, Epoch [3/15], Step [200/232], LR=7.18E-04, Loss: 0.37493 Test, Loss: 0.45041 Train, Epoch [4/15], Step [100/232], LR=8.72E-04, Loss: 0.34068 Train, Epoch [4/15], Step [200/232], LR=9.53E-04, Loss: 0.30082 Test, Loss: 0.39480 Train, Epoch [5/15], Step [100/232], LR=9.99E-04, Loss: 0.28256 Train, Epoch [5/15], Step [200/232], LR=9.97E-04, Loss: 0.22916 Test, Loss: 0.32349 Train, Epoch [6/15], Step [100/232], LR=9.81E-04, Loss: 0.27618 Train, Epoch [6/15], Step [200/232], LR=9.59E-04, Loss: 0.22244 Test, Loss: 0.27823 Train, Epoch [7/15], Step [100/232], LR=9.19E-04, Loss: 0.27027 Train, Epoch [7/15], Step [200/232], LR=8.80E-04, Loss: 0.23215 Test, Loss: 0.23268 Train, Epoch [8/15], Step [100/232], LR=8.20E-04, Loss: 0.18421 Train, Epoch [8/15], Step [200/232], LR=7.68E-04, Loss: 0.14332 Test, Loss: 0.20657 Train, Epoch [9/15], Step [100/232], LR=6.92E-04, Loss: 0.22331 Train, Epoch [9/15], Step [200/232], LR=6.31E-04, Loss: 0.11996 Test, Loss: 0.17660 Train, Epoch [10/15], Step [100/232], LR=5.48E-04, Loss: 0.17564 Train, Epoch [10/15], Step [200/232], LR=4.83E-04, Loss: 0.10377 Test, Loss: 0.17561 Train, Epoch [11/15], Step [100/232], LR=3.99E-04, Loss: 0.13386 Train, Epoch [11/15], Step [200/232], LR=3.37E-04, Loss: 0.09563 Test, Loss: 0.16085 Train, Epoch [12/15], Step [100/232], LR=2.59E-04, Loss: 0.15647 Train, Epoch [12/15], Step [200/232], LR=2.05E-04, Loss: 0.08841 Test, Loss: 0.15706 Train, Epoch [13/15], Step [100/232], LR=1.41E-04, Loss: 0.12193 Train, Epoch [13/15], Step [200/232], LR=9.89E-05, Loss: 0.08674 Test, Loss: 0.14486 Train, Epoch [14/15], Step [100/232], 
LR=5.41E-05, Loss: 0.15558 Train, Epoch [14/15], Step [200/232], LR=2.87E-05, Loss: 0.08408 Test, Loss: 0.14363 Train, Epoch [15/15], Step [100/232], LR=7.23E-06, Loss: 0.14586 Train, Epoch [15/15], Step [200/232], LR=4.30E-07, Loss: 0.06049 Test, Loss: 0.14225 Running time: 71.01 min epoch_best_loss = 0.142251, epoch_best = 14, lr_best = 4.00E-09 Done!
# Render every following figure at 100 dpi and recall which epoch was best.
mpl.rcParams.update({"figure.dpi": 100})
print(f'NUM_EPOCHS={NUM_EPOCHS}, epoch_best={epoch_best}')
NUM_EPOCHS=15, epoch_best=14
# Per-epoch mean losses as arrays (used by the loss-curve plot).
loss_train = np.array(epoch_train_losses)
loss_test = np.array(epoch_test_losses)

# read true and predicted (dx, dy, dz) from the best epoch results
x_tru, y_tru, z_tru = (
    np.array(series)[epoch_best]
    for series in (epoch_test_tru0, epoch_test_tru1, epoch_test_tru2)
)
x_pre, y_pre, z_pre = (
    np.array(series)[epoch_best]
    for series in (epoch_test_pre0, epoch_test_pre1, epoch_test_pre2)
)
print(f'Number of events in test set {x_tru.size}')

# convert (dx, dy, dz) to (theta): angle between the vector and the z axis,
# from the transverse length sqrt(dx^2 + dy^2) and dz, in degrees
vTrue_sintheta = np.sqrt(np.power(x_tru, 2) + np.power(y_tru, 2))
vPre_sintheta = np.sqrt(np.power(x_pre, 2) + np.power(y_pre, 2))
vTrue_theta = np.rad2deg(np.arctan2(vTrue_sintheta, z_tru))
vPre_theta = np.rad2deg(np.arctan2(vPre_sintheta, z_pre))
# bias between predicted and true theta
result_theta = vPre_theta - vTrue_theta

# convert (dx, dy, dz) to (phi): azimuth in the x-y plane, in degrees
vTrue_phi = np.rad2deg(np.arctan2(y_tru, x_tru))
vPre_phi = np.rad2deg(np.arctan2(y_pre, x_pre))
# bias between predicted and true phi, wrapped into [-180, 180)
result_phi = ((vPre_phi - vTrue_phi) + 180) % 360 - 180
Number of events in test set 928
# Histograms of the three true direction components, side by side.
plt.figure(figsize=(14,4))
plt.suptitle("Truth information", fontsize=18)
for panel, (component, axis_label) in enumerate(
        ((x_tru, r'dir_x'), (y_tru, r'dir_y'), (z_tru, r'dir_z')), start=1):
    plt.subplot(1, 3, panel)
    plt.hist(component, bins=100, density=False)
    plt.xlabel(axis_label, fontsize=18)
plt.show()
fig = plt.figure(figsize=(9,4))

# Left panel: learning rate of every optimisation step across all epochs.
plt.subplot(1, 2, 1)
df = np.array(epoch_Learning_Rates).flatten().tolist()
plt.plot(np.arange(len(df)), df)
plt.title("Learning Rate", fontsize=18)
plt.xlabel("steps", fontsize=18)
plt.ylabel("Learning rate", fontsize=18)

# Right panel: learning rate per batch within the best epoch.
plt.subplot(1, 2, 2)
plt.plot(np.arange(len(train_loader)), epoch_Learning_Rates[epoch_best])
plt.title("Learning Rate", fontsize=18)
plt.xlabel("batch number", fontsize=18)
plt.ylabel("Learning rate", fontsize=18)

plt.tight_layout()
# Mean training and test loss per epoch on a shared axis.
fig = plt.figure(figsize=(5, 5))
epoch_axis = np.arange(1, NUM_EPOCHS + 1)
for curve in (loss_train, loss_test):
    plt.plot(epoch_axis, curve)
plt.title("Model loss", fontsize=18)
plt.xlabel('epochs', fontsize=18)
plt.ylabel('Loss', fontsize=18)
plt.grid()
plt.legend(['train', 'test'])
<matplotlib.legend.Legend at 0x2ab9af5be190>
# plot the comparison between predicted and true theta
# Left panel: 2-D histogram of predicted vs. true theta; right panel:
# distribution of the per-event difference (predicted - true).
fig = plt.figure(figsize=(12, 5))
ax = fig.add_subplot(121)
# 90 bins over 0-180 degrees on each axis (2 degrees per bin), with a
# logarithmic colour scale for the bin counts.
h, xedge,yedge,patches=plt.hist2d(vTrue_theta, vPre_theta, bins=90, range=((0,180),(0,180)),norm=mpl.colors.LogNorm(),label = 'test dataset ({:.0f} events)'.format(len(vTrue_theta)))
cbar = plt.colorbar()
cbar.ax.set_ylabel('N_testset ({:.0f} events)'.format(len(vTrue_theta)), labelpad=10, rotation=270)
plt.plot([0,180], [0,180], "k-",label="y=x") # plots line y = x (perfect prediction)
plt.legend()
plt.xlabel(r"True $\theta$", fontsize=18)
plt.ylabel(r"Predicted $\theta$", fontsize=18)
plt.title(r"Validation result")
plt.xlim(0,180)
plt.ylim(0,180)
# Second legend call: re-creates the legend at a fixed location.
plt.legend( loc="upper left") #lower right center right
plt.grid( linestyle = '--', linewidth = 0.5)
# Tick labels are shown as whole degrees.
ax.xaxis.set_major_formatter(StrMethodFormatter(u"{x:.0f}°"))
ax.yaxis.set_major_formatter(StrMethodFormatter(u"{x:.0f}°"))
ax = fig.add_subplot(122)
# the histogram of the data
# Every event gets weight 1/N, so the bar heights add up to 1.
weight=np.ones_like(result_theta)/float(len(result_theta))
n, hbins, patches = plt.hist(result_theta, 100, weights=weight, facecolor='r', alpha=0.75)
# Mean and standard deviation of the bias; the legend labels the standard
# deviation (sqrt of the variance about the mean) as "RMS".
mean2 = np.mean(result_theta)
variance = np.var(result_theta)
sd2 = np.sqrt(variance)
plt.ylabel("P.D.F.", fontsize=18)
plt.xlabel(r"Predicted $\theta$ - True $\theta$ ($^\circ$)", fontsize=18)
#plt.title("TestSet: bias distribution")
plt.xlim(-120,120)
ax.xaxis.set_major_formatter(StrMethodFormatter(u"{x:.0f}°"))
plt.grid(True)
# The adjacent string literals are joined before .format() is applied, so
# both placeholders are filled by this single call.
leg = plt.legend([r'mean: {:.2f}$^\circ$' '\n' r'RMS: {:.2f}$^\circ$'.format(mean2,sd2) ], loc="upper right", fontsize=9) #lower right center right
# Make the legend frame invisible.
leg.get_frame().set_alpha(0.0)
leg.get_frame().set_linewidth(0.0)
plt.tight_layout()
# convert jupyter notebook to python script
!jupyter nbconvert --to script PointNet2_example.ipynb
[NbConvertApp] WARNING | Config option `kernel_spec_manager_class` not recognized by `NbConvertApp`. [NbConvertApp] Converting notebook PointNet2_example.ipynb to script [NbConvertApp] Writing 22810 bytes to PointNet2_example.py