新建
队伍:T A榜0.7594 Rank4 / B榜0.7759 Rank1
T.T.
推荐镜像 :Third-party software:ai4s-cup-0.1
推荐机型 :c12_m46_1 * NVIDIA GPU B
赞 7
3
9
目录
分享
本notebook并不能完全复现B榜排名所提交的submission,而是简化版本(四个模型融合,本版本A/B榜分别为 0.7564/0.7780),可以完成在线训练和测试,核心代码继承自uni-mol并重写以实现一些自定义功能,比如注册器。这里总结了一些有效的OLED调参经验,分享给大家😃!
有效的提升✨
- 对target预先进行normalization,unimol自带的scaler在修复bug后效果依然差(可能还有细微问题),可参考 使用unimol进行OLED量化属性预测(Baseline改良 LB:0.7238)
- 替换oled pretrain为molecule,同时将dropout强制改为p=0,代码见
NNModel._init_model
,由于分子式中仅包含一个Ir原子,所以在molecule中被标记为UNK,即unknown,不会影响训练,可参考 Dropout是万能的吗?unimol_tools中molecule arch 与 oled arch效果对比 - epochs延长至40并搭配patience=10,同时适当降低lr,缩小batch size,将优化器替换为AdamW,默认weight_decay=1e-2,这些调参的目的主要是降低过拟合,辅之以更长iter
- 坐标进行TTA
test_data['coordinates'] = [i + np.random.randn(*i.shape).clip(-3, 3) * 0.0025 for i in test_data['coordinates']]
- clip plqy
sub['plqy'].clip(0, 1, inplace=True)
可能有效的尝试🔥
- src_distance添加随机噪声,具体见
TorchDataset
,需要注意src_distance首尾均为pad token, 需要将四个角distance设为0 - 对Ir进行单独的token repr提取并分类
- oled搭配dropout=0.5性能会有提升,但受限于oled性能有限,我最终没有融合
- 各种loss
- 各种调参
- TTA + N模型融合,泛化性能max,A榜0.75+,B榜0.77+
- 分开部分target,例如plqy单独训练或者融合'e_ad', 'homo', 'lumo', 'edme'进行训练
小小的建议🙏
- 避免过拟合A榜,A榜作为公开的测试集容易引起大家竞相拟合最终面向A榜编程,性能的提升需要在本地CV验证,当模型本身能力没有较大提升时,A榜性能的提升可能导致B榜性能降低(遵循合集性能不变)
- unimol的UniMolModel/NNModel/Trainer/MolTrain等功能交叉互联,十分难调试,经常在各个类之间跳跃😔,dataset没法做online aug,期待官方能进一步优化
本次最终多个config如下:
for batch_size in [4, 8]:
config = OrderedDict(
target_columns=['plqy', 'e_ad', 'homo', 'lumo', 'edme'], # 'plqy', 'e_ad', 'homo', 'lumo', 'edme'
model_name='unimolv1',
data_type='molecule', # oled, crystal, molecule
task='multilabel_regression', # multilabel_regression, regression
epochs=40,
learning_rate=0.0003,
batch_size=batch_size,
patience=10,
metrics='r2',
loss_key='default', # default
drop_out=0.5, # molecule时无效,强制为0.0
activate_key='default', # default
target_normalize='auto', # auto, none
pre_norm=True,
repeat=1,
lr_type='linear',
optim_type='AdamW',
seed=42,
split_seed=42,
)
exp_suffix, metric, sub = train_and_test(config)
submissions_dict[exp_suffix] = sub
for learning_rate in [0.0002, 0.00025]:
config = OrderedDict(
target_columns=['plqy', 'e_ad', 'homo', 'lumo', 'edme'], # 'plqy', 'e_ad', 'homo', 'lumo', 'edme'
model_name='unimolv1',
data_type='molecule', # oled, crystal, molecule
task='multilabel_regression', # multilabel_regression, regression
epochs=40,
learning_rate=learning_rate,
batch_size=8,
patience=10,
metrics='r2',
loss_key='default', # default
drop_out=0.5, # 0.0
activate_key='default', # default
target_normalize='auto', # auto, none
pre_norm=True,
repeat=1,
lr_type='linear',
optim_type='AdamW',
seed=42,
split_seed=42,
)
exp_suffix, metric, sub = train_and_test(config)
submissions_dict[exp_suffix] = sub
代码
文本
[ ]
import os
import time
import joblib
import json
import numpy as np
import pandas as pd
from collections import OrderedDict
from addict import Dict
import matplotlib.pyplot as plt
from tqdm import tqdm
import logging
from logging import Formatter
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
from sklearn.metrics import (
mean_absolute_error,
r2_score
)
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader as TorchDataLoader
from torch.optim import Adam, AdamW
from torch.nn.utils import clip_grad_norm_
from transformers.optimization import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup, get_polynomial_decay_schedule_with_warmup
from unimol.utils import Metrics
from unimol.data import DataHub
from unimol.models.unimol import UniMolModel
from unimol.utils import YamlHandler
from unimol.models.loss import GHMC_Loss, FocalLossWithLogits, myCrossEntropyLoss
代码
文本
[ ]
# 需要挂载bohr OLED0912 数据集
DIR_PATH = './oled0912_v1/'
print(os.path.abspath(DIR_PATH))
if not os.path.exists(DIR_PATH):
DIR_PATH = '/bohr/oled0912-f1wb/v1/'
代码
文本
[ ]
np.set_printoptions(precision=3, suppress=False)
# Registries tracking which logger namespaces have already been configured,
# and which of those own a FileHandler.
logger_initialized = {}
logger_with_file_initialized = {}
Formatter.default_msec_format = '%s.%03d'


def get_root_logger(namespace='Uni-Mol(QSAR)', log_file=None, log_level=logging.INFO, file_mode='a'):
    """Return a (cached) logger for ``namespace``.

    log.info(msg) or higher prints to console and (if configured) file;
    log.debug(msg) only goes to the file handler.

    Args:
        namespace: logger name used for caching.
        log_file: optional path; when given, a DEBUG-level FileHandler is attached.
        log_level: console handler level.
        file_mode: mode passed to FileHandler ('a' append by default).
    """
    logger = logging.getLogger(namespace)
    if namespace in logger_initialized:
        if log_file:
            # FileHandler stores an *absolute* path in ``baseFilename``, so
            # normalize the requested path before comparing — otherwise a
            # relative log_file would look like a change on every call.
            requested = os.path.abspath(log_file)
            if not logger.handlers or requested != getattr(logger.handlers[-1], 'baseFilename', None):
                # The log file changed: drop cached loggers and rebuild.
                logger.manager.loggerDict.clear()
                logger_initialized.pop(namespace)
                # The namespace may have been set up without a file handler,
                # in which case this key was never written; a default avoids
                # the KeyError the original code raised here.
                logger_with_file_initialized.pop(namespace, None)
                logger = logging.getLogger(namespace)
            else:
                return logger
        else:
            return logger
    # Logger itself accepts everything; the handlers filter per-destination.
    logger.setLevel(logging.DEBUG)
    c_formatter = logging.Formatter("%(asctime)s | %(lineno)s | %(levelname)s | %(name)s | %(message)s", "%Y-%m-%d %H:%M:%S")
    c_handler = logging.StreamHandler()
    c_handler.setLevel(log_level)
    c_handler.setFormatter(c_formatter)
    logger.addHandler(c_handler)
    if log_file:
        f_formatter = logging.Formatter("%(asctime)s | %(lineno)s | %(levelname)s | %(name)s | %(message)s", "%Y-%m-%d %H:%M:%S")
        f_handler = logging.FileHandler(log_file, encoding='utf-8', mode=file_mode)
        f_handler.setLevel(logging.DEBUG)
        f_handler.setFormatter(f_formatter)
        logger.addHandler(f_handler)
        logger_with_file_initialized[namespace] = True
    logger_initialized[namespace] = True
    return logger
代码
文本
[ ]
class R2Loss(torch.nn.Module):
    """Negative summed coefficient-of-determination loss.

    Returns ``1 - sum_k R^2_k`` over target columns, so minimizing it
    maximizes the total per-target R^2.
    """

    def __init__(self):
        super().__init__()

    def forward(self, y_pred, y_true):
        residual_ss = ((y_true - y_pred) ** 2).sum(dim=0)
        total_ss = ((y_true - y_true.mean(dim=0)) ** 2).sum(dim=0)
        r2_per_target = 1.0 - residual_ss / total_ss
        return 1.0 - r2_per_target.sum()
class WR2Loss(torch.nn.Module):
    """Weighted negative R^2 loss: ``1 - sum_k w_k * R^2_k``.

    The weights were previously hard-coded for the five OLED targets
    ('plqy', 'e_ad', 'homo', 'lumo', 'edme'); they are now a constructor
    parameter with the same values as default, so existing callers are
    unaffected while other target sets become usable.
    """

    def __init__(self, weights=(0.4, 0.2, 0.1, 0.1, 0.2)):
        super(WR2Loss, self).__init__()
        # One weight per target column; materialised on the right
        # device/dtype at call time.
        self.weights = tuple(weights)

    def forward(self, y_pred, y_true):
        n_targets = len(self.weights)
        W = torch.tensor(self.weights, dtype=y_true.dtype).reshape((n_targets, 1)).to(y_true.device)
        sse = torch.sum(torch.square(y_true - y_pred), dim=0)
        sst = torch.sum(torch.square(y_true - torch.mean(y_true, dim=0)), dim=0)
        # einsum yields a shape-(1,) tensor: the weighted sum of per-target R^2.
        loss = torch.einsum("ij,ji->i", (1 - sse / sst).reshape(1, n_targets), W) - 1
        return -loss
class WeightedMSELoss(nn.Module):
    """Mean squared error with a fixed per-target weight vector.

    Args:
        weights: sequence of per-column weights, broadcast against the
            squared error before averaging.
    """

    def __init__(self, weights):
        super(WeightedMSELoss, self).__init__()
        self.weights = weights

    def forward(self, prediction, target):
        # Build the weight tensor on the inputs' device and dtype instead of
        # the original hard-coded ``.cuda()``, which crashed on CPU-only runs
        # and silently assumed float32 inputs.
        weights = torch.as_tensor(self.weights, dtype=prediction.dtype, device=prediction.device)
        squared_error = (prediction - target) ** 2
        weighted_error = squared_error * weights
        loss = torch.mean(weighted_error)
        return loss
class UniMolModelV2(UniMolModel):
    """UniMolModel variant that adds the Ir-token head's logits to the
    CLS-head logits (see the notebook note about extracting a separate
    token representation for the single Ir atom).

    NOTE(review): ``self.Ir_head = self.classification_head`` aliases the
    SAME module — the two heads share weights. If independent weights were
    intended, a copy would be needed; confirm this sharing is deliberate.
    """

    def __init__(self, output_dim=2, data_type='molecule', **params):
        super().__init__(output_dim, data_type, **params)
        # Alias (shared weights) of the base class' classification head.
        self.Ir_head = self.classification_head

    def forward(
        self,
        src_tokens,
        src_distance,
        src_coord,
        src_edge_type,
        gas_id=None,
        gas_attr=None,
        pressure=None,
        temperature=None,
        return_repr=False,
        **kwargs
    ):
        """Run the encoder and return logits, or repr dict if ``return_repr``.

        Mirrors the base UniMolModel forward except for the final logits,
        which sum the CLS-token head and the Ir-token head outputs.
        """
        # None padding mask lets the encoder skip masking entirely.
        padding_mask = src_tokens.eq(self.padding_idx)
        if not padding_mask.any():
            padding_mask = None
        x = self.embed_tokens(src_tokens)

        def get_dist_features(dist, et):
            # Gaussian basis features of pairwise distances, projected and
            # reshaped into per-head attention bias (B*H, N, N).
            n_node = dist.size(-1)
            gbf_feature = self.gbf(dist, et)
            gbf_result = self.gbf_proj(gbf_feature)
            graph_attn_bias = gbf_result
            graph_attn_bias = graph_attn_bias.permute(0, 3, 1, 2).contiguous()
            graph_attn_bias = graph_attn_bias.view(-1, n_node, n_node)
            return graph_attn_bias
        graph_attn_bias = get_dist_features(src_distance, src_edge_type)
        (
            encoder_rep,
            _,
            _,
            _,
            _,
        ) = self.encoder(x, padding_mask=padding_mask, attn_mask=graph_attn_bias)
        cls_repr = encoder_rep[:, 0, :]  # CLS token repr
        all_repr = encoder_rep[:, :, :]  # all token repr
        # Position 1 is assumed to hold the Ir atom's token — TODO confirm
        # that the tokenizer always places Ir first after CLS.
        Ir_repr = encoder_rep[:, 1, :]  # Ir token repr
        # Per-sample counts of real atom tokens, used to slice out atomic reprs.
        filtered_tensors = []
        for tokens in src_tokens:
            filtered_tensor = tokens[(tokens != 0) & (tokens != 1) & (tokens != 2)]  # filter out BOS(0), EOS(1), PAD(2)
            filtered_tensors.append(filtered_tensor)
        lengths = [len(filtered_tensor) for filtered_tensor in filtered_tensors]  # Compute the lengths of the filtered tensors
        cls_atomic_reprs = []
        for i in range(len(all_repr)):
            atomic_repr = encoder_rep[i, 1:lengths[i]+1, :]
            cls_atomic_reprs.append(atomic_repr)
        repr_dict = {'cls_repr': cls_repr, 'atomic_reprs': cls_atomic_reprs}
        if return_repr:
            return repr_dict
        if self.data_type == 'mof':
            gas_embed = self.gas_embed(gas_id, gas_attr)  # shape of gas_embed is [batch_size, gas_dim*2]
            env_embed = self.env_embed(pressure, temperature)  # shape of gas_embed is [batch_size, env_dim*3]
            rep = torch.cat([cls_repr, gas_embed, env_embed], dim=-1)
            logits = self.classifier(rep)
        else:
            # Sum of CLS-head and Ir-head logits (heads share weights, see above).
            logits = self.classification_head(cls_repr) + self.Ir_head(Ir_repr)
        return logits
代码
文本
[ ]
# Model registry: maps the config's ``model_name`` to a model class.
NNMODEL_REGISTER = {
    'unimolv1': UniMolModel,
    'unimolv2': UniMolModelV2,
}
# Loss registry keyed by task; some tasks map to a sub-dict selected by the
# config's ``loss_key``. (Name keeps the original 'RREGISTER' spelling since
# other code in this file references it.)
LOSS_RREGISTER = {
    'classification': myCrossEntropyLoss,
    'multiclass': myCrossEntropyLoss,
    'regression': nn.MSELoss(),
    'multilabel_classification': {
        'default': FocalLossWithLogits,
        'bce': nn.BCEWithLogitsLoss(),
        'ghm': GHMC_Loss(bins=10, alpha=0.5),
        'focal': FocalLossWithLogits,
    },
    'multilabel_regression': {
        'default': nn.MSELoss(),
        'bce': nn.BCEWithLogitsLoss(),
        'MSELoss': nn.MSELoss(),
        'WR2Loss': WR2Loss(),
        'R2Loss': R2Loss(),
        'WeightedMSELoss': WeightedMSELoss(weights=[2.0, 1.4, 1.0, 1.0, 1.4]),
    }
}
# Output activation registry keyed by task; multilabel regression has a
# sub-dict selected by the config's ``activate_key``.
ACTIVATION_FN = {
    # predict prob shape should be (N, K), especially for binary classification, K equals to 1.
    'classification': lambda x: F.softmax(x, dim=-1)[:, 1:],
    # softmax is used for multiclass classification
    'multiclass': lambda x: F.softmax(x, dim=-1),
    'regression': lambda x: x,
    # sigmoid is used for multilabel classification
    'multilabel_classification': lambda x: F.sigmoid(x),
    # no activation function is used for multilabel regression
    'multilabel_regression': {
        'default': lambda x: x,
        'sigmoid': lambda x: F.sigmoid(x),
    }
}
# Fixed output dims for tasks whose width does not depend on the data;
# other tasks derive output_dim from the dataset (see NNModel.__init__).
OUTPUT_DIM = {
    'classification': 2,
    'regression': 1,
}
class NNModel(object):
    """Wraps model construction, k-fold cross-validation training and
    checkpoint-ensemble evaluation around a Trainer.

    Args:
        data: DataHub output dict; must contain 'num_classes',
            'target_scaler', 'unimol_input', 'target', 'scaffolds'.
        trainer: a Trainer instance (owns device, splitter, metrics, seed).
        **params: flat config (model_name, data_type, task, loss_key, ...).
    """

    def __init__(self, data, trainer, **params):
        self.data = data
        self.num_classes = self.data['num_classes']
        self.target_scaler = self.data['target_scaler']
        self.features = data['unimol_input']
        self.model_name = params.get('model_name', 'unimolv1')
        self.data_type = params.get('data_type', 'molecule')
        self.loss_key = params.get('loss_key', 'default')
        self.activate_key = params.get('activate_key', 'default')
        self.trainer: Trainer = trainer
        self.splitter = self.trainer.splitter
        self.model_params = params.copy()
        self.task = params['task']
        # Output width: fixed per-task, dataset-derived for multiclass,
        # otherwise one output per target column.
        if self.task in OUTPUT_DIM:
            self.model_params['output_dim'] = OUTPUT_DIM[self.task]
        elif self.task == 'multiclass':
            self.model_params['output_dim'] = self.data['multiclass_cnt']
        else:
            self.model_params['output_dim'] = self.num_classes
        self.model_params['device'] = self.trainer.device
        self.cv = dict()
        self.metrics = self.trainer.metrics
        # Tasks with several selectable losses store a sub-dict keyed by loss_key.
        if isinstance(LOSS_RREGISTER[self.task], dict):
            self.loss_func = LOSS_RREGISTER[self.task][self.loss_key]
        else:
            self.loss_func = LOSS_RREGISTER[self.task]
        if isinstance(ACTIVATION_FN[self.task], dict):
            self.activation_fn = ACTIVATION_FN[self.task][self.activate_key]
        else:
            self.activation_fn = ACTIVATION_FN[self.task]
        self.save_path = self.trainer.save_path
        self.trainer.set_seed(self.trainer.seed)
        self.model = self._init_model(**self.model_params)

    def _init_model(self, model_name, **params):
        """Instantiate the registered model and apply the dropout policy.

        For the 'molecule' pretrain, dropout is forced to 0 (see the
        notebook notes); otherwise the configured drop_out is applied.
        """
        model: UniMolModelV2 = NNMODEL_REGISTER[model_name](**params)
        if params['data_type'] == 'molecule':
            model.classification_head.dropout = nn.Dropout(p=0)
        else:
            if params['drop_out'] > 0:
                model.classification_head.dropout = nn.Dropout(p=params['drop_out'])
        return model

    def collect_data(self, X, y, idx):
        """Slice features/targets by fold indices.

        Returns (features, targets) with targets as a torch tensor; dict
        features are sliced per key.
        """
        assert isinstance(y, np.ndarray), 'y must be numpy array'
        if isinstance(X, np.ndarray):
            return torch.from_numpy(X[idx]).float(), torch.from_numpy(y[idx])
        elif isinstance(X, dict):
            # BUG FIX: the original branch tested ``isinstance(X, list)`` and
            # then called ``X.items()``, which can never succeed for a list.
            # The dict check matches both the comprehension and the error
            # message below.
            return {k: v[idx] for k, v in X.items()}, torch.from_numpy(y[idx])
        else:
            raise ValueError('X must be numpy array or dict')

    def run(self):
        """Train one model per CV fold and assemble out-of-fold predictions.

        Stores OOF predictions, inverse-transformed targets and the overall
        metric in ``self.cv`` and dumps them under ``self.save_path``.
        """
        get_root_logger().info("start training Uni-Mol:{}".format(self.model_name))
        X = np.asarray(self.features)
        y = np.asarray(self.data['target'])
        scaffold = np.asarray(self.data['scaffolds'])
        if self.task == 'classification':
            y_pred = np.zeros_like(y.reshape(y.shape[0], self.num_classes)).astype(float)
        else:
            y_pred = np.zeros((y.shape[0], self.model_params['output_dim']))
        for fold, (tr_idx, te_idx) in enumerate(self.splitter.split(X, y, scaffold)):
            X_train, y_train = X[tr_idx], y[tr_idx]
            X_valid, y_valid = X[te_idx], y[te_idx]
            traindataset = NNDataset(X_train, y_train, self.model_params.get('aug_level', 0))
            validdataset = NNDataset(X_valid, y_valid)
            if fold > 0:
                # re-initialize the model for each subsequent fold
                self.model = self._init_model(**self.model_params)
            _y_pred = self.trainer.fit_predict(self.model, traindataset, validdataset, self.loss_func, self.activation_fn, self.save_path, fold, self.target_scaler)
            y_pred[te_idx] = _y_pred
            if 'multiclass_cnt' in self.data:
                label_cnt = self.data['multiclass_cnt']
            else:
                label_cnt = None
            get_root_logger().info("fold {0}, result {1}".format(
                fold,
                self.metrics.cal_metric(
                    self.data['target_scaler'].inverse_transform(y_valid),
                    self.data['target_scaler'].inverse_transform(_y_pred),
                    label_cnt=label_cnt
                )
            )
            )
        self.cv['pred'] = y_pred
        self.cv['metric'] = self.metrics.cal_metric(
            self.data['target_scaler'].inverse_transform(y),
            self.data['target_scaler'].inverse_transform(self.cv['pred']))
        self.cv['target'] = self.data['target_scaler'].inverse_transform(y)
        self.cv['pred'] = self.data['target_scaler'].inverse_transform(self.cv['pred'])
        self.dump(self.cv['pred'], self.save_path, 'cv.data')
        self.dump(self.cv['target'], self.save_path, 'cv.target')
        self.dump(self.cv['metric'], self.save_path, 'metric.result')
        get_root_logger().info("Uni-Mol metrics score: \n{}".format(self.cv['metric']))
        get_root_logger().info("Uni-Mol & Metric result saved!")

    def dump(self, data, dir, name):
        """Persist ``data`` as ``dir/name`` via joblib, creating ``dir`` if needed."""
        path = os.path.join(dir, name)
        if not os.path.exists(dir):
            os.makedirs(dir)
        joblib.dump(data, path)

    def evaluate(self, trainer=None, checkpoints_path=None):
        """Average test-set predictions over all fold checkpoints into
        ``self.cv['test_pred']``."""
        get_root_logger().info("start predict NNModel:{}".format(self.model_name))
        testdataset = NNDataset(self.features, np.asarray(self.data['target']))
        for fold in range(self.splitter.n_splits):
            model_path = os.path.join(checkpoints_path, f'model_{fold}.pth')
            self.model.load_state_dict(torch.load(
                model_path, map_location=self.trainer.device)['model_state_dict'])
            _y_pred, _, __ = trainer.predict(self.model, testdataset, self.loss_func, self.activation_fn,
                                             self.save_path, fold, self.target_scaler, epoch=1, load_model=True)
            if fold == 0:
                y_pred = np.zeros_like(_y_pred)
            y_pred += _y_pred
        y_pred /= self.splitter.n_splits
        self.cv['test_pred'] = y_pred

    def count_parameters(self, model):
        """Number of trainable parameters in ``model``."""
        return sum(p.numel() for p in model.parameters() if p.requires_grad)
def NNDataset(data, label=None, aug_level=0):
    """Factory wrapper producing a :class:`TorchDataset` (kept as a function
    for API compatibility with callers that treat it like a class)."""
    return TorchDataset(data, label=label, aug_level=aug_level)
class TorchDataset(Dataset):
    """Dataset over unimol feature dicts with optional distance-noise TTA.

    When ``aug_level > 0``, a symmetric, zero-diagonal, non-negative noise
    matrix is added to each sample's ``src_distance`` on access.

    NOTE(review): the notebook says the pad-token corner entries of
    src_distance should stay 0 — that is not enforced here; confirm the
    collate step handles it.
    """

    def __init__(self, data, label=None, aug_level=0):
        self.data = data
        # Dummy zero labels let the same dataset serve inference-only use.
        self.label = label if label is not None else np.zeros((len(data), 1))
        self.aug_level = aug_level

    def __getitem__(self, idx):
        sample = self.data[idx]
        if self.aug_level != 0:
            col, row = sample['src_distance'].shape
            noise = np.random.randn(col, row) * np.random.uniform(0.0005, 0.001) * self.aug_level
            noise = np.triu(noise)
            # Mirror the upper triangle so the perturbation stays symmetric
            # and zero the diagonal (self-distances must remain 0).
            noise += noise.T - 2 * np.diag(noise.diagonal())
            # Keep the additive noise non-negative so distances cannot shrink
            # below zero.
            noise = np.clip(noise, 0, None)
            # BUG FIX: the original did ``self.data[idx]['src_distance'] += noise``,
            # permanently mutating the stored sample so noise accumulated
            # across epochs. Return a perturbed shallow copy instead.
            sample = dict(sample)
            sample['src_distance'] = sample['src_distance'] + noise
        return sample, self.label[idx]

    def __len__(self):
        return len(self.data)
class Splitter(object):
    """Thin facade over sklearn CV splitters.

    '5fold_random' -> shuffled KFold; '5fold_scaffold' -> GroupKFold
    grouped by molecular scaffold.
    """

    def __init__(self, split_method='5fold_random', seed=42):
        self.split_method = split_method
        self.seed = seed
        self.splitter = self._init_split(self.split_method, self.seed)
        self.n_splits = 5
        self.skf = None

    def _init_split(self, split_method, seed=42):
        """Build the underlying sklearn splitter for ``split_method``."""
        if split_method == '5fold_random':
            return KFold(n_splits=5, shuffle=True, random_state=seed)
        if split_method == '5fold_scaffold':
            return GroupKFold(n_splits=5)
        raise ValueError('Unknown splitter method: {}'.format(split_method))

    def split(self, data, target=None, group=None):
        """Return the (train_idx, test_idx) fold iterator; scaffold mode
        passes target/group through to GroupKFold."""
        if self.split_method in ['5fold_random']:
            self.skf = self.splitter.split(data)
        elif self.split_method in ['5fold_scaffold']:
            self.skf = self.splitter.split(data, target, group)
        else:
            raise ValueError('Unknown splitter method: {}'.format(self.split_method))
        return self.skf
class Trainer(object):
    """Training engine: owns device/seed/splitter setup, the epoch loop with
    optional AMP, early stopping, checkpointing, prediction and repr
    extraction. Checkpoints are written as ``{save_path}/model_{fold}.pth``.
    """

    def __init__(self, save_path=None, **params):
        self.save_path = save_path
        self.task = params.get('task', None)
        self.cfg_params = params
        # 'repr' mode only extracts representations, so no metric is needed.
        if self.task != 'repr':
            self.metrics_str = params['metrics']
            self.metrics = Metrics(self.task, self.metrics_str)
        self._init_trainer(**params)

    def _init_trainer(self, **params):
        ### init common params (seeds, CV splitter, logging) ###
        self.split_method = params.get('split_method', '5fold_random')
        self.split_seed = params.get('split_seed', 42)
        self.seed = params.get('seed', 42)
        self.set_seed(self.seed)
        self.splitter = Splitter(self.split_method, self.split_seed)
        self.logger_level = int(params.get('logger_level', 1))
        ### init NN trainer params ###
        # NOTE: reads 'epochs', not 'max_epochs'.
        self.learning_rate = float(params.get('learning_rate', 1e-4))
        self.batch_size = params.get('batch_size', 32)
        self.max_epochs = params.get('epochs', 50)
        self.warmup_ratio = params.get('warmup_ratio', 0.1)
        self.patience = params.get('patience', 10)
        self.max_norm = params.get('max_norm', 1.0)
        self.cuda = params.get('cuda', False)
        self.amp = params.get('amp', False)
        self.device = torch.device("cuda:0" if torch.cuda.is_available() and self.cuda else "cpu")
        # Mixed-precision GradScaler only when both CUDA and AMP are enabled.
        self.scaler = torch.cuda.amp.GradScaler() if self.device.type == 'cuda' and self.amp == True else None

    def decorate_batch(self, batch, feature_name=None):
        """Move one (input, target) batch to the device (feature_name unused)."""
        return self.decorate_torch_batch(batch)

    def decorate_graph_batch(self, batch):
        """Device/dtype handling for graph batches.

        NOTE(review): assumes ``batch`` carries a ``.y`` attribute
        (PyG-style) — confirm against the dataloader used.
        """
        net_input, net_target = {'net_input': batch.to(
            self.device)}, batch.y.to(self.device)
        if self.task in ['classification', 'multiclass', 'multilabel_classification']:
            net_target = net_target.long()
        else:
            net_target = net_target.float()
        return net_input, net_target

    def decorate_torch_batch(self, batch):
        """function used to decorate batch data: moves inputs/targets to the
        device and casts targets (long for classification, float otherwise).
        """
        net_input, net_target = batch
        if isinstance(net_input, dict):
            net_input, net_target = {
                k: v.to(self.device) for k, v in net_input.items()}, net_target.to(self.device)
        else:
            net_input, net_target = {'net_input': net_input.to(
                self.device)}, net_target.to(self.device)
        if self.task == 'repr':
            net_target = None
        elif self.task in ['classification', 'multiclass', 'multilabel_classification']:
            net_target = net_target.long()
        else:
            net_target = net_target.float()
        return net_input, net_target

    def fit_predict(self, model, train_dataset, valid_dataset, loss_func, activation_fn, dump_dir, fold, target_scaler, feature_name=None):
        """Train one fold with early stopping, then predict the validation
        set with the best checkpoint.

        Returns the validation predictions (still in scaled target space).
        """
        model = model.to(self.device)
        train_dataloader = NNDataLoader(
            feature_name=feature_name,
            dataset=train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            collate_fn=model.batch_collate_fn,
            drop_last=True,
        )
        # drop_last removes a possible final batch of size 1, which would
        # break BatchNorm1d.
        min_val_loss = float("inf")
        max_score = float("-inf")
        wait = 0
        ### init optimizer and LR schedule (warmup fraction of total steps) ###
        num_training_steps = len(train_dataloader) * self.max_epochs
        num_warmup_steps = int(num_training_steps * self.warmup_ratio)
        optim_type = self.cfg_params.get('optim_type', 'Adam')
        if optim_type == 'AdamW':
            # AdamW defaults to weight_decay=1e-2 here (see notebook notes).
            weight_decay = self.cfg_params.get('weight_decay', 1e-2)
            optimizer = AdamW(model.parameters(), lr=self.learning_rate, eps=1e-6, weight_decay=weight_decay)
        else:
            weight_decay = self.cfg_params.get('weight_decay', 0)
            optimizer = Adam(model.parameters(), lr=self.learning_rate, eps=1e-6, weight_decay=weight_decay)
        lr_type = self.cfg_params.get('lr_type', 'linear')
        if lr_type == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)
        elif lr_type == 'poly':
            scheduler = get_polynomial_decay_schedule_with_warmup(
                optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps, power=2)
        else:
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)
        for epoch in range(self.max_epochs):
            model = model.train()
            # Progress Bar
            start_time = time.time()
            batch_bar = tqdm(total=len(train_dataloader), dynamic_ncols=True,
                             leave=False, position=0, desc='Train', ncols=5)
            trn_loss = []
            for i, batch in enumerate(train_dataloader):
                net_input, net_target = self.decorate_batch(
                    batch, feature_name)
                optimizer.zero_grad()  # Zero gradients
                if self.scaler and self.device.type == 'cuda':
                    # AMP path: forward under autocast, scaled backward below.
                    with torch.cuda.amp.autocast():
                        outputs = model(**net_input)
                        loss = loss_func(outputs, net_target)
                else:
                    with torch.set_grad_enabled(True):
                        outputs = model(**net_input)
                        loss = loss_func(outputs, net_target)
                trn_loss.append(float(loss.data))
                # tqdm lets you add some details so you can monitor training as you train.
                batch_bar.set_postfix(
                    Epoch="Epoch {}/{}".format(epoch+1, self.max_epochs),
                    loss="{:.04f}".format(float(sum(trn_loss) / (i + 1))),
                    lr="{:.04f}".format(float(optimizer.param_groups[0]['lr'])))
                if self.scaler and self.device.type == 'cuda':
                    # This is a replacement for loss.backward()
                    self.scaler.scale(loss).backward()
                    # unscale the gradients of optimizer's assigned params in-place
                    self.scaler.unscale_(optimizer)
                    # Clip the norm of the gradients to max_norm.
                    clip_grad_norm_(model.parameters(), self.max_norm)
                    # This is a replacement for optimizer.step()
                    self.scaler.step(optimizer)
                    self.scaler.update()
                else:
                    loss.backward()
                    clip_grad_norm_(model.parameters(), self.max_norm)
                    optimizer.step()
                # Per-step (not per-epoch) LR schedule.
                scheduler.step()
                batch_bar.update()
            batch_bar.close()
            total_trn_loss = np.mean(trn_loss)
            y_preds, val_loss, metric_score = self.predict(
                model, valid_dataset, loss_func, activation_fn, dump_dir, fold, target_scaler, epoch, load_model=False, feature_name=feature_name)
            end_time = time.time()
            total_val_loss = np.mean(val_loss)
            _score = list(metric_score.values())[0]
            _metric = list(metric_score.keys())[0]
            message = 'Epoch [{}/{}] train_loss: {:.4f}, val_loss: {:.4f}, val_{}: {:.4f}, lr: {:.6f}, ' \
                      '{:.1f}s'.format(epoch+1, self.max_epochs,
                                       total_trn_loss, total_val_loss,
                                       _metric, _score,
                                       optimizer.param_groups[0]['lr'],
                                       (end_time - start_time))
            get_root_logger().info(message)
            is_early_stop, min_val_loss, wait, max_score = self._early_stop_choice(
                wait, total_val_loss, min_val_loss, metric_score, max_score, model, dump_dir, fold, self.patience, epoch)
            if is_early_stop:
                break
        # Reload the best checkpoint and produce the fold's final predictions.
        y_preds, _, _ = self.predict(model, valid_dataset, loss_func, activation_fn,
                                     dump_dir, fold, target_scaler, epoch, load_model=True, feature_name=feature_name)
        return y_preds

    def _early_stop_choice(self, wait, loss, min_loss, metric_score, max_score, model, dump_dir, fold, patience, epoch):
        """Dispatch early stopping to loss-based or metric-based judging."""
        ### hyperparameters may need tuning if you rely on early stop; loss-based stopping was found suitable in benchmark tests. ###
        if not isinstance(self.metrics_str, str) or self.metrics_str in ['loss', 'none', '']:
            # loss-based early stopping: use the trainer's own routine
            is_early_stop, min_val_loss, wait = self._judge_early_stop_loss(
                wait, loss, min_loss, model, dump_dir, fold, patience, epoch)
        else:
            # otherwise delegate to metric-based judging
            is_early_stop, min_val_loss, wait, max_score = self.metrics._early_stop_choice(
                wait, min_loss, metric_score, max_score, model, dump_dir, fold, patience, epoch)
        return is_early_stop, min_val_loss, wait, max_score

    def _judge_early_stop_loss(self, wait, loss, min_loss, model, dump_dir, fold, patience, epoch):
        """Checkpoint on improved val loss; stop after ``patience`` stalls.

        NOTE(review): compares ``wait`` against ``self.patience``, ignoring
        the ``patience`` argument — confirm which one is authoritative.
        """
        is_early_stop = False
        if loss <= min_loss:
            min_loss = loss
            wait = 0
            info = {'model_state_dict': model.state_dict()}
            os.makedirs(dump_dir, exist_ok=True)
            torch.save(info, os.path.join(dump_dir, f'model_{fold}.pth'))
        elif loss >= min_loss:
            wait += 1
            if wait == self.patience:
                get_root_logger().warning(f'Early stopping at epoch: {epoch+1}')
                is_early_stop = True
        return is_early_stop, min_loss, wait

    def predict(self, model, dataset, loss_func, activation_fn, dump_dir, fold, target_scaler=None, epoch=1, load_model=False, feature_name=None):
        """Predict ``dataset``; optionally reload the fold checkpoint first.

        Returns (y_preds, val_loss list, metric_score). Loss/metric are only
        computed when ``load_model`` is False (i.e. during validation).
        """
        model = model.to(self.device)
        if load_model == True:
            load_model_path = os.path.join(dump_dir, f'model_{fold}.pth')
            model_dict = torch.load(load_model_path, map_location=self.device)[
                "model_state_dict"]
            model.load_state_dict(model_dict)
            get_root_logger().info("load model success!")
        dataloader = NNDataLoader(
            feature_name=feature_name,
            dataset=dataset,
            batch_size=self.batch_size,
            shuffle=False,
            collate_fn=model.batch_collate_fn,
        )
        model = model.eval()
        batch_bar = tqdm(total=len(dataloader), dynamic_ncols=True,
                         position=0, leave=False, desc='val', ncols=5)
        val_loss = []
        y_preds = []
        y_truths = []
        for i, batch in enumerate(dataloader):
            net_input, net_target = self.decorate_batch(batch, feature_name)
            # Get model outputs
            with torch.no_grad():
                outputs = model(**net_input)
                if not load_model:
                    loss = loss_func(outputs, net_target)
                    val_loss.append(float(loss.data))
            y_preds.append(activation_fn(outputs).cpu().numpy())
            y_truths.append(net_target.detach().cpu().numpy())
            if not load_model:
                batch_bar.set_postfix(
                    Epoch="Epoch {}/{}".format(epoch+1, self.max_epochs),
                    loss="{:.04f}".format(float(np.sum(val_loss) / (i + 1))))
            batch_bar.update()
        y_preds = np.concatenate(y_preds)
        y_truths = np.concatenate(y_truths)
        # Broad except: models without an ``output_dim`` attribute fall back
        # to None (only used as a hint by cal_metric).
        try:
            label_cnt = model.output_dim
        except:
            label_cnt = None
        if target_scaler is not None:
            # Metrics are always computed in the original (unscaled) space.
            inverse_y_preds = target_scaler.inverse_transform(y_preds)
            inverse_y_truths = target_scaler.inverse_transform(y_truths)
            metric_score = self.metrics.cal_metric(
                inverse_y_truths, inverse_y_preds, label_cnt=label_cnt) if not load_model else None
        else:
            metric_score = self.metrics.cal_metric(
                y_truths, y_preds, label_cnt=label_cnt) if not load_model else None
        batch_bar.close()
        return y_preds, val_loss, metric_score

    def inference(self, model, dataset, feature_name=None, return_repr=True):
        """Extract CLS/atomic representations for every sample in ``dataset``.

        Returns a dict with 'cls_repr' (list of vectors) and 'atomic_reprs'.
        """
        model = model.to(self.device)
        dataloader = NNDataLoader(
            feature_name=feature_name,
            dataset=dataset,
            batch_size=self.batch_size,
            shuffle=False,
            collate_fn=model.batch_collate_fn,
        )
        model = model.eval()
        batch_bar = tqdm(total=len(dataloader), dynamic_ncols=True,
                         position=0, leave=False, desc='val', ncols=5)
        repr_dict = {"cls_repr": [], "atomic_reprs": []}
        for i, batch in enumerate(dataloader):
            net_input, _ = self.decorate_batch(batch, feature_name)
            with torch.no_grad():
                outputs = model(return_repr=return_repr, **net_input)
                assert isinstance(outputs, dict)
                for key, value in outputs.items():
                    if isinstance(value, list):
                        value_list = [item.cpu().numpy() for item in value]
                        repr_dict[key].extend(value_list)
                    else:
                        repr_dict[key].extend([value.cpu().numpy()])
        repr_dict["cls_repr"] = np.concatenate(repr_dict["cls_repr"]).tolist()
        return repr_dict

    def set_seed(self, seed):
        """function used to set a random seed
        Arguments:
            seed {int} -- seed number, will set to torch and numpy
        """
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        np.random.seed(seed)
def NNDataLoader(feature_name=None, dataset=None, batch_size=None, shuffle=False, collate_fn=None, drop_last=False):
    """Thin wrapper around torch's DataLoader.

    ``feature_name`` is unused; it is kept so all call sites can pass it
    uniformly.
    """
    return TorchDataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=collate_fn,
        drop_last=drop_last,
    )
class MolTrain(object):
    """High-level training entry point: builds the DataHub, Trainer and
    NNModel from a flat config, runs CV training and saves config/threshold
    artifacts under ``save_path``.
    """

    def __init__(self,
                 task='classification',
                 data_type='molecule',
                 epochs=10,
                 learning_rate=1e-4,
                 batch_size=16,
                 patience=5,
                 metrics="none",
                 split='random',
                 save_path='./exp',
                 remove_hs=False,
                 **kwargs
                 ):
        # Defaults below are overridden first by **kwargs, then by the
        # explicit constructor arguments assigned further down.
        config = {
            # data
            'smiles_col': "SMILES",
            'target_col_prefix': "TARGET",
            'target_normalize': "auto",
            'anomaly_clean': True,
            'smi_strict': False,
            'model_name': "unimolv1",
            # trainer
            'split_method': "5fold_random",
            'split_seed': 42,
            'seed': 42,
            'logger_level': 1,
            'patience': 10,
            'max_epochs': 100,
            # NOTE(review): Trainer reads 'epochs', not 'max_epochs' — this
            # key appears unused; confirm before relying on it.
            'learning_rate': 1e-4,
            'warmup_ratio': 0.03,
            'batch_size': 16,
            'max_norm': 5.0,
            'cuda': True,
            'amp': True,
            **kwargs
        }
        # NOTE(review): YamlHandler is constructed with '/' as its path —
        # presumably only write_yaml is used later; verify.
        self.yamlhandler = YamlHandler('/')
        config = Dict(config)
        config.task = task
        config.data_type = data_type
        config.epochs = epochs
        config.learning_rate = learning_rate
        config.batch_size = batch_size
        config.patience = patience
        config.metrics = metrics
        config.split = split
        config.remove_hs = remove_hs
        self.save_path = save_path
        self.config = config

    def fit(self, data):
        """Preprocess ``data``, train the CV ensemble, and stash out-of-fold
        predictions in ``self.cv_pred`` (inverse-transformed)."""
        self.datahub = DataHub(data=data, is_train=True,
                               save_path=self.save_path, **self.config)
        self.data = self.datahub.data
        self.update_and_save_config()
        self.trainer = Trainer(save_path=self.save_path, **self.config)
        self.model = NNModel(self.data, self.trainer, **self.config)
        get_root_logger().info('Model: \n' + repr(self.model.model))
        self.model.run()
        scalar = self.data['target_scaler']
        y_pred = self.model.cv['pred']
        y_true = np.array(self.data['target'])
        get_root_logger().info(f"scalar: {scalar.scaler}")
        metrics = self.trainer.metrics
        if scalar is not None:
            # NOTE(review): NNModel.run already inverse-transforms cv['pred'];
            # this applies inverse_transform a second time — confirm intended.
            y_pred = scalar.inverse_transform(y_pred)
            y_true = scalar.inverse_transform(y_true)
        if self.config["task"] in ['classification', 'multilabel_classification']:
            # Persist the probability threshold for later prediction runs.
            threshold = metrics.calculate_classification_threshold(y_true, y_pred)
            joblib.dump(threshold, os.path.join(self.save_path, 'threshold.dat'))
        self.cv_pred = y_pred
        return

    def update_and_save_config(self):
        """Fill data-derived config fields and write config.yaml to save_path."""
        self.config['num_classes'] = self.data['num_classes']
        self.config['target_cols'] = ','.join(self.data['target_cols'])
        if self.config['task'] == 'multiclass':
            self.config['multiclass_cnt'] = self.data['multiclass_cnt']
        if self.config['split'] == 'random':
            self.config['split'] = 'random_5fold'
        else:
            self.config['split'] = 'scaffold_5fold'
        if self.save_path is not None:
            if not os.path.exists(self.save_path):
                get_root_logger().info('Create output directory: {}'.format(self.save_path))
                os.makedirs(self.save_path)
            else:
                get_root_logger().info('Output directory already exists: {}'.format(self.save_path))
                get_root_logger().info('Warning: Overwrite output directory: {}'.format(self.save_path))
            out_path = os.path.join(self.save_path, 'config.yaml')
            self.yamlhandler.write_yaml(
                data=self.config, out_file_path=out_path)
        return
class MolPredict(object):
def __init__(self, load_model=None):
if not load_model:
raise ValueError("load_model is empty")
self.load_model = load_model
config_path = os.path.join(load_model, 'config.yaml')
self.config = YamlHandler(config_path).read_yaml()
self.config.target_cols = self.config.target_cols.split(',')
self.task = self.config.task
self.target_cols = self.config.target_cols
def predict(self, data, save_path=None, metrics='none'):
self.save_path = save_path
if not metrics or metrics != 'none':
self.config.metrics = metrics
# load test data
self.datahub = DataHub(data=data, is_train=False,
save_path=self.load_model, **self.config)
self.trainer = Trainer(save_path=self.load_model, **self.config)
self.model = NNModel(self.datahub.data, self.trainer, **self.config)
self.model.evaluate(self.trainer, self.load_model)
y_pred = self.model.cv['test_pred']
scalar = self.datahub.data['target_scaler']
if scalar is not None:
y_pred = scalar.inverse_transform(y_pred)
df = self.datahub.data['raw_data'].copy()
predict_cols = ['predict_' + col for col in self.target_cols]
if self.task == 'multiclass' and self.config.multiclass_cnt is not None:
prob_cols = ['prob_' + str(i)
for i in range(self.config.multiclass_cnt)]
df[prob_cols] = y_pred
df[predict_cols] = np.argmax(y_pred, axis=1).reshape(-1, 1)
elif self.task in ['classification', 'multilabel_classification']:
threshold = joblib.load(
open(os.path.join(self.load_model, 'threshold.dat'), "rb"))
prob_cols = ['prob_' + col for col in self.target_cols]
df[prob_cols] = y_pred
df[predict_cols] = (y_pred > threshold).astype(int)
else:
prob_cols = predict_cols
df[predict_cols] = y_pred
if self.save_path:
os.makedirs(self.save_path, exist_ok=True)
if not (df[self.target_cols] == -1.0).all().all():
metrics = self.trainer.metrics.cal_metric(
df[self.target_cols].values, df[prob_cols].values)
get_root_logger().info("final predict metrics score: \n{}".format(metrics))
if self.save_path:
joblib.dump(metrics, os.path.join(
self.save_path, 'test_metric.result'))
else:
df.drop(self.target_cols, axis=1, inplace=True)
if self.save_path:
prefix = data.split(
'/')[-1].split('.')[0] if isinstance(data, str) else 'test'
self.save_predict(df, self.save_path, prefix)
get_root_logger().info("pipeline finish!")
return y_pred
def save_predict(self, data, dir, prefix):
    """Write ``data`` to ``<dir>/<prefix>.predict.<k>.csv``.

    ``k`` is the first run index whose file does not already exist, so
    repeated calls never overwrite an earlier prediction dump.
    """
    os.makedirs(dir, exist_ok=True)
    existing = os.listdir(dir)
    run_id = 0
    while f'{prefix}.predict.{run_id}.csv' in existing:
        run_id += 1
    path = os.path.join(dir, f'{prefix}.predict.{run_id}.csv')
    data.to_csv(path)
    get_root_logger().info("save predict result to {}".format(path))
代码
文本
[ ]
class NormalizeTarget():
    """Per-column z-score normalization with precomputed statistics.

    ``target_mean_std`` maps each target name to its 'mean' and 'std'
    (e.g. the mean/std rows of ``DataFrame.describe()``). When it is
    None, both operations are identity pass-throughs. Both methods
    mutate ``data`` in place and return it.
    """

    def __init__(self, target_mean_std=None):
        self.target_mean_std = target_mean_std

    def normalize(self, data):
        """Replace each tracked column with (value - mean) / std."""
        stats = self.target_mean_std
        if stats is None:
            return data
        for col in stats:
            data[col] = (data[col] - stats[col]['mean']) / stats[col]['std']
        return data

    def invert(self, data, invert_rescale=1.0):
        """Undo normalize: value * std * invert_rescale + mean."""
        stats = self.target_mean_std
        if stats is None:
            return data
        for col in stats:
            data[col] = data[col] * stats[col]['std'] * invert_rescale + stats[col]['mean']
        return data
def collect_train_test(
    target=('plqy', 'e_ad', 'homo', 'lumo', 'edme'),
    normlize=True,
    repeat=1
):
    """Load train/test npz archives and build unimol-style data dicts.

    Fixes: the ``target`` default was a mutable list (shared across
    calls); an immutable tuple is a safe drop-in replacement.

    :param target: target column names to read from ``train.npz``.
    :param normlize: if True, z-score targets with train-set mean/std.
    :param repeat: duplicate the training set this many times
        (simple oversampling; the same arrays are referenced repeatedly).
    :return: ``(normlizer, train_data, test_data)`` where ``normlizer``
        can invert the normalization on predictions.
    """
    train = np.load(os.path.join(DIR_PATH, 'train.npz'), allow_pickle=True)
    test = np.load(os.path.join(DIR_PATH, 'test.npz'), allow_pickle=True)
    train_target = pd.DataFrame({col: train[col] for col in target})
    if normlize:
        # describe() rows 1:3 are exactly the 'mean' and 'std' rows,
        # which is the layout NormalizeTarget expects.
        normlizer = NormalizeTarget(train_target.describe().iloc[1:3])
        train_target = normlizer.normalize(train_target)
    else:
        normlizer = NormalizeTarget()
    train_target = train_target.values.tolist()
    train_coords = [np.array(item) for item in train['coord']]
    train_atoms = [list(item) for item in train['symbol']]
    if repeat > 1:
        train_coords = train_coords * int(repeat)
        train_atoms = train_atoms * int(repeat)
        train_target = train_target * int(repeat)
    train_data = {
        'target': train_target,
        'coordinates': train_coords,
        'atoms': train_atoms,
    }
    test_data = {
        'id': test['id'],
        'coordinates': [np.array(item) for item in test['coord']],
        'atoms': [list(item) for item in test['symbol']],
    }
    return normlizer, train_data, test_data
代码
文本
[ ]
def train_and_test(cfg):
    """Train one unimol config, score its CV predictions, and write a submission.

    Results are cached under ``./exp_<suffix>/`` (suffix encodes the full
    config), so a finished experiment is loaded instead of retrained.

    Fixes: bare ``except: pass`` replaced by ``except Exception`` with
    logging (the bare form also swallowed KeyboardInterrupt); the
    ``weights`` dict is built once instead of per loop iteration;
    matplotlib figures are closed to avoid accumulating across configs;
    deprecated ``Series.clip(..., inplace=True)`` on a chained selection
    replaced by plain assignment.

    :param cfg: OrderedDict of training hyper-parameters; ``target_columns``
        is popped, the rest is forwarded to MolTrain.
    :return: ``(exp_suffix, competition_metric, submission_dataframe)``.
    """
    # Competition weights for the weighted-R2 metric (fixed, loop-invariant).
    weights = {
        'plqy': 0.4,
        'e_ad': 0.2,
        'homo': 0.1,
        'lumo': 0.1,
        'edme': 0.2,
    }
    target_columns = cfg.pop('target_columns')
    exp_suffix = '_'.join(target_columns + list(map(str, cfg.values())))
    os.makedirs(f'./exp_{exp_suffix}', exist_ok=True)
    # Cache hit: a finished experiment already has its metric and submission.
    if os.path.exists(f'./exp_{exp_suffix}/metric.json') and os.path.exists(f'./exp_{exp_suffix}/submission.csv'):
        sub = pd.read_csv(f'./exp_{exp_suffix}/submission.csv')
        metrics = json.load(open(f'./exp_{exp_suffix}/metric.json'))
        if 'competition_metric' in metrics:
            return exp_suffix, metrics['competition_metric'], sub
    logger = get_root_logger(log_file=f'./exp_{exp_suffix}/root.log')
    normlizer, train_data, test_data = collect_train_test(
        target=target_columns,
        normlize=cfg['pre_norm'],
        repeat=cfg['repeat']
    )
    clf = MolTrain(**cfg,
                   split='random',
                   remove_hs=True,
                   save_path=f'./exp_{exp_suffix}')
    logger.info('exp_suffix: ' + exp_suffix)
    logger.info('Config: \n' + json.dumps(clf.config, indent=4))
    logger.info(train_data['target'][0])
    logger.info(train_data['coordinates'][0])
    logger.info(train_data['atoms'][0])
    # Skip fitting when a previous run already produced metric.result.
    if not os.path.exists(f'./exp_{exp_suffix}/metric.result'):
        clf.fit(train_data)
    metrics = np.load(f'./exp_{exp_suffix}/metric.result', allow_pickle=True)
    json.dump(dict(metrics), fp=open(f'./exp_{exp_suffix}/metric.json', mode='w'), indent=4, ensure_ascii=True)
    logger.info(metrics)
    # Collect out-of-fold targets/predictions — either from the model just
    # trained or from the cv.data dumped by an earlier (partially cached) run.
    if hasattr(clf, 'model'):
        df = pd.DataFrame(data=np.asarray(clf.model.cv['target']), columns=target_columns)
        train_predicts_df = pd.DataFrame(data=clf.model.cv['pred'], columns=target_columns)
    else:
        data = np.load(os.path.join(DIR_PATH, 'train.npz'), allow_pickle=True)
        df = pd.DataFrame({
            'plqy': data['plqy'],
            'e_ad': data['e_ad'],
            'homo': data['homo'],
            'lumo': data['lumo'],
            'edme': data['edme']
        })
        if cfg['pre_norm']:
            df = (df - df.mean()) / df.std()
        pred = joblib.load(f'./exp_{exp_suffix}/cv.data')
        train_predicts_df = pd.DataFrame(data=np.array(pred), columns=target_columns)
    # De-normalize both sides so metrics are computed in physical units.
    if cfg['pre_norm']:
        df = normlizer.invert(df, invert_rescale=1.00)
        train_predicts_df = normlizer.invert(train_predicts_df, invert_rescale=1.00)
    competition_metric = 0
    for k in target_columns:
        try:
            fig, ax = plt.subplots(1, 1, figsize=(5, 5))
            corr = np.corrcoef(df[k], train_predicts_df[k])[0, 1]
            mae = mean_absolute_error(df[k], train_predicts_df[k])
            # NOTE(review): sqrt of a negative R2 yields nan, which then
            # propagates into competition_metric — confirm this is intended.
            r = np.sqrt(r2_score(df[k], train_predicts_df[k]))
            ax.scatter(df[k], train_predicts_df[k], s=2)
            ax.axline((0, 0), slope=1, color='black', linestyle='--')
            ax.set_title(k)
            ax.set_xlim([df[k].min(), df[k].max()])
            ax.set_ylim([df[k].min(), df[k].max()])
            ax.set_aspect('equal')
            ax.set_xlabel('True')
            ax.set_ylabel('Pred')
            plt.text(0.7, 0.4, f'Corr={corr:.04f}\nR={r:.04f}\nMae={mae:.04f}', ha='left', va='top', transform=ax.transAxes)
            plt.savefig(f'./exp_{exp_suffix}/corr_{k}.png')
            plt.close(fig)  # release the figure; we run many configs per process
            competition_metric += weights[k] * r * r
            get_root_logger().info(f'metric of {k} is corr={corr:.06f}, r2={r*r:.06f}, mae={mae:.06f}')
        except Exception:
            # Best effort: a failed plot/score for one target must not
            # abort the run, but it should no longer fail silently.
            logger.exception(f'failed to evaluate target {k}')
    metrics = np.load(f'./exp_{exp_suffix}/metric.result', allow_pickle=True)
    metrics['competition_metric'] = competition_metric
    json.dump(dict(metrics), fp=open(f'./exp_{exp_suffix}/metric.json', mode='w'), indent=4, ensure_ascii=True)
    get_root_logger().info(f'competition_metric is {competition_metric:.04f}')
    # Test: predict on the held-out set and build the submission file.
    pred_clf = MolPredict(load_model=f'./exp_{exp_suffix}')
    predicts = pred_clf.predict(test_data)
    sub = pd.DataFrame(data=predicts.copy(), columns=target_columns)
    sub = normlizer.invert(sub, invert_rescale=1.00)
    sub['id'] = test_data['id']
    if 'plqy' in sub.columns:
        # plqy is a quantum yield, physically bounded to [0, 1].
        sub['plqy'] = sub['plqy'].clip(0, 1)
    sub.to_csv(f'./exp_{exp_suffix}/submission.csv', index=False, header=True)
    logger.info('\n' + sub.describe().to_string())
    return exp_suffix, competition_metric, sub
代码
文本
[ ]
if __name__ == '__main__':
    submissions_dict = {}
    # Four model variants differing only in batch size / learning rate;
    # their predictions are ensembled by a per-id mean below. The pairs
    # reproduce the original two sweeps (batch_size in [4, 8] at lr 3e-4,
    # then lr in [2e-4, 2.5e-4] at batch_size 8) in the same order, so
    # the cached exp_<suffix> directories are reused unchanged.
    sweep = [
        (4, 0.0003),
        (8, 0.0003),
        (8, 0.0002),
        (8, 0.00025),
    ]
    for batch_size, learning_rate in sweep:
        config = OrderedDict(
            target_columns=['plqy', 'e_ad', 'homo', 'lumo', 'edme'],
            model_name='unimolv1',
            data_type='molecule',  # oled, crystal, molecule
            task='multilabel_regression',  # multilabel_regression, regression
            epochs=40,
            learning_rate=learning_rate,
            batch_size=batch_size,
            patience=10,
            metrics='r2',
            loss_key='default',
            drop_out=0.5,  # forced to 0.0 when data_type == 'molecule'
            activate_key='default',
            target_normalize='auto',  # auto, none
            pre_norm=True,
            repeat=1,
            lr_type='linear',
            optim_type='AdamW',
            seed=42,
            split_seed=42,
        )
        exp_suffix, metric, sub = train_and_test(config)
        submissions_dict[exp_suffix] = sub
    # Ensemble submit: average each target per molecule id across configs.
    all_columns = ['plqy', 'e_ad', 'homo', 'lumo', 'edme']
    all_sub = pd.concat(list(submissions_dict.values()))
    submission = all_sub.groupby('id').mean()
    submission = submission[[i for i in all_columns if i in submission.columns]]
    submission.reset_index(inplace=True)
    if 'plqy' in submission.columns:
        # plqy is a quantum yield, physically bounded to [0, 1];
        # plain assignment avoids the deprecated inplace clip on a selection.
        submission['plqy'] = submission['plqy'].clip(0, 1)
    submission.to_csv('./submission.csv', index=False, header=True)
代码
文本
已赞7
本文被以下合集收录
AI4S Cup|OLED Notebooks 合集
AI4S Cup
更新于 2024-08-18
9 篇4 人关注
测试合集文章列表100篇
xingyanshi@dp.tech很长的名字xingyanshi@dp.tech很长的名字很长的
更新于 2024-08-04
104 篇2 人关注
推荐阅读
公开
GG A榜(1/0.7649) B榜(3/0.7708)GG
发布于 2023-10-31
4 赞2 转存文件
公开
AI4S Cup - OLED材料量化属性预测 rank2 A榜0.7483 B榜0.7725牛战士从不摘下他的面具
发布于 2023-11-15
6 赞5 转存文件