新建
Protein data process
Heisenburger
赞 1
1
目录
数据集
AI4SCUP-CNS-BBB(v1)
Protein pocket data process
代码
文本
[3]
import os
import sys
import torch
import glob
from Bio.Data.PDBData import protein_letters_3to1
import json
from tqdm import tqdm
from multiprocessing import Pool
from functools import lru_cache
from unifold.msa.mmcif import parse
import argparse
import gzip
import numpy as np
from unifold.data.residue_constants import restype_order_with_x
from unifold.msa.templates import _get_atom_positions as get_atom_positions
import pickle
import lmdb
from rdkit import Chem
from rdkit.Chem import AllChem
from scipy.spatial import distance_matrix
from unifold.data.protein import Protein
from unifold.data.protein import to_pdb
import shutil
import datetime
from unifold.data import residue_constants as rc
from unifold.modules.frame import Rotation, Frame
from unicore.utils import batched_gather
import warnings
from unicore.data import BaseWrapperDataset, UnicoreDataset
import torch.nn as nn
from scipy.spatial.transform import Rotation
warnings.filterwarnings("ignore",category=DeprecationWarning)
/opt/conda/lib/python3.8/site-packages/Bio/Data/SCOPData.py:18: BiopythonDeprecationWarning: The 'Bio.Data.SCOPData' module will be deprecated in a future release of Biopython in favor of 'Bio.Data.PDBData. warnings.warn(
代码
文本
Some constants
代码
文本
[4]
# Biological-assembly table, keyed by PDB id. Each entry is read below as a
# dict with parallel 'chains' and 'opers' lists (see get_label).
# Opened with a context manager so the file handle is closed right after
# parsing instead of being left to the garbage collector.
with open("mmcif_assembly3_origin.json") as assembly_file:
    pdb_assembly = json.load(assembly_file)
# Three-letter residue names accepted as amino acids, including the
# CYX / HID / HIE / HIP variants of CYS and HIS.
res_dict = {
    "ALA", "ARG", "ASN", "ASP", "CYS", "CYX",
    "GLN", "GLU", "GLY", "HIS", "HID", "HIE",
    "HIP", "ILE", "LEU", "LYS", "MET", "PHE",
    "PRO", "SER", "THR", "TRP", "TYR", "VAL",
}

# Upper-case element symbols treated as metals.
# NOTE(review): 'YB' appears twice; the duplicate is kept so the list is
# unchanged for any code that relies on its length or ordering.
metal = [
    'MN', 'CA', 'ZN', 'CU', 'NA', 'FE', 'MG', 'NI',
    'CD', 'HG', 'K', 'RH', 'CR', 'PR', 'V', 'CO',
    'LA', 'FR', 'LI', 'RB', 'CS', 'SR', 'BA', 'RA',
    'CE', 'ND', 'SM', 'EU', 'TH', 'PA', 'U', 'NP',
    'PU', 'TI', 'AG', 'AU', 'AL', 'SN', 'PB', 'SB',
    'RU', 'BI', 'IR', 'W', 'YB', 'PD', 'ER', 'YB',
    'PT', 'RE', 'Y', 'TB', 'LU', 'SC', 'OS', 'GA',
    'AM', 'CM', 'FM', 'ES', 'CF', 'IN',
]

# Residue codes that presumably denote DNA/RNA nucleotides -- TODO confirm.
basic_group = {
    "DA", "DC", "DG", "DT", "DU", "DI",
    "A", "C", "G", "T", "U",
}
代码
文本
Some utility functions
代码
文本
[5]
def mmcif_object_to_fasta(mmcif_object, auth_chain_id: str) -> str:
    """Return the one-letter sequence string of one chain of a parsed mmCIF.

    Args:
        mmcif_object: parsed mmCIF object exposing ``seqres_to_structure``.
        auth_chain_id: chain id used as key into ``seqres_to_structure``.

    Returns:
        One character per residue position. Residue names missing from
        ``protein_letters_3to1`` -- or mapped to something that is not exactly
        one character long -- become ``"X"``.
    """
    chain_residues = mmcif_object.seqres_to_structure[auth_chain_id]
    letters = []
    # The container is indexed by position 0..len-1 rather than iterated
    # directly, exactly as the lookup was done before.
    for position in range(len(chain_residues)):
        code = protein_letters_3to1.get(chain_residues[position].name, "X")
        # Multi-character translations are treated as unknown residues.
        letters.append(code if len(code) == 1 else "X")
    return "".join(letters)
代码
文本
[6]
def atom37_to_torsion_angles(
    aatype,
    all_atom_positions,
    all_atom_mask
):
    """Compute the 7 per-residue torsion angles from atom37 coordinates.

    The seven torsions are, in order: pre-omega, phi, psi and four chi angles.

    Args:
        aatype: NumPy integer array of residue-type indices, last axis is the
            residue axis. Values are clamped to at most 20.
        all_atom_positions: NumPy array ``[..., n_res, 37, 3]`` of coordinates.
        all_atom_mask: NumPy array ``[..., n_res, 37]`` marking present atoms.

    Returns:
        Tuple of NumPy arrays
        ``(torsion_angles_sin_cos, alt_torsion_angles_sin_cos, torsion_angles_mask)``
        with shapes ``[..., n_res, 7, 2]``, ``[..., n_res, 7, 2]`` and
        ``[..., n_res, 7]``. Masked-out torsions carry the ``[1, 0]``
        placeholder. The empty-input case returns all-zero NumPy arrays of the
        matching shapes (previously torch tensors leaked out of that branch,
        which was inconsistent with the normal return type).
    """
    aatype = torch.from_numpy(aatype).long()
    all_atom_positions = torch.from_numpy(all_atom_positions)
    all_atom_mask = torch.from_numpy(all_atom_mask)

    if aatype.shape[-1] == 0:
        # No residues: return correctly shaped empty arrays, converted to
        # NumPy so that both exits of the function yield the same type.
        base_shape = aatype.shape
        torsion_angles_sin_cos = all_atom_positions.new_zeros(
            *base_shape, 7, 2
        )
        alt_torsion_angles_sin_cos = all_atom_positions.new_zeros(
            *base_shape, 7, 2
        )
        torsion_angles_mask = all_atom_positions.new_zeros(
            *base_shape, 7
        )
        return (
            np.array(torsion_angles_sin_cos),
            np.array(alt_torsion_angles_sin_cos),
            np.array(torsion_angles_mask),
        )

    aatype = torch.clamp(aatype, max=20)

    # Shift positions/masks by one residue (zero-padded at the start) so each
    # residue can see the atoms of its predecessor.
    pad = all_atom_positions.new_zeros([*all_atom_positions.shape[:-3], 1, 37, 3])
    prev_all_atom_positions = torch.cat(
        [pad, all_atom_positions[..., :-1, :, :]], dim=-3
    )
    pad = all_atom_mask.new_zeros([*all_atom_mask.shape[:-2], 1, 37])
    prev_all_atom_mask = torch.cat([pad, all_atom_mask[..., :-1, :]], dim=-2)

    # Four atoms per backbone torsion. Indices refer to the atom37 layout
    # (presumably 0=N, 1=CA, 2=C, 4=O -- TODO confirm against residue_constants).
    pre_omega_atom_pos = torch.cat(
        [prev_all_atom_positions[..., 1:3, :], all_atom_positions[..., :2, :]],
        dim=-2,
    )
    phi_atom_pos = torch.cat(
        [prev_all_atom_positions[..., 2:3, :], all_atom_positions[..., :3, :]],
        dim=-2,
    )
    psi_atom_pos = torch.cat(
        [all_atom_positions[..., :3, :], all_atom_positions[..., 4:5, :]],
        dim=-2,
    )

    # A backbone torsion is valid only if all four of its atoms are present.
    pre_omega_mask = torch.prod(prev_all_atom_mask[..., 1:3], dim=-1) * torch.prod(
        all_atom_mask[..., :2], dim=-1
    )
    phi_mask = prev_all_atom_mask[..., 2] * torch.prod(
        all_atom_mask[..., :3], dim=-1, dtype=all_atom_mask.dtype
    )
    psi_mask = (
        torch.prod(all_atom_mask[..., :3], dim=-1, dtype=all_atom_mask.dtype)
        * all_atom_mask[..., 4]
    )

    # Gather the four atoms of every chi angle according to the residue type.
    chi_atom_indices = torch.as_tensor(rc.chi_atom_indices, device=aatype.device)
    atom_indices = chi_atom_indices[..., aatype, :, :]
    chis_atom_pos = batched_gather(
        all_atom_positions, atom_indices, -2, len(atom_indices.shape[:-2])
    )

    # Extra all-zero row so that the clamped index 20 has no chi angles.
    chi_angles_mask = list(rc.chi_angles_mask)
    chi_angles_mask.append([0.0, 0.0, 0.0, 0.0])
    chi_angles_mask = all_atom_mask.new_tensor(chi_angles_mask)
    chis_mask = chi_angles_mask[aatype, :]

    chi_angle_atoms_mask = batched_gather(
        all_atom_mask,
        atom_indices,
        dim=-1,
        num_batch_dims=len(atom_indices.shape[:-2]),
    )
    chi_angle_atoms_mask = torch.prod(
        chi_angle_atoms_mask, dim=-1, dtype=chi_angle_atoms_mask.dtype
    )
    # A chi angle counts only if the residue type defines it AND all of its
    # atoms are present.
    chis_mask = chis_mask * chi_angle_atoms_mask

    torsions_atom_pos = torch.cat(
        [
            pre_omega_atom_pos[..., None, :, :],
            phi_atom_pos[..., None, :, :],
            psi_atom_pos[..., None, :, :],
            chis_atom_pos,
        ],
        dim=-3,
    )
    torsion_angles_mask = torch.cat(
        [
            pre_omega_mask[..., None],
            phi_mask[..., None],
            psi_mask[..., None],
            chis_mask,
        ],
        dim=-1,
    )

    # Build a local frame from the first three atoms of each torsion and
    # express the fourth atom in it; two of its coordinates give (sin, cos).
    torsion_frames = Frame.from_3_points(
        torsions_atom_pos[..., 1, :],
        torsions_atom_pos[..., 2, :],
        torsions_atom_pos[..., 0, :],
        eps=1e-8,
    )
    fourth_atom_rel_pos = torsion_frames.invert().apply(torsions_atom_pos[..., 3, :])
    torsion_angles_sin_cos = torch.stack(
        [fourth_atom_rel_pos[..., 2], fourth_atom_rel_pos[..., 1]], dim=-1
    )

    # Normalise each (sin, cos) pair; the epsilon avoids division by zero.
    denom = torch.sqrt(
        torch.sum(
            torch.square(torsion_angles_sin_cos),
            dim=-1,
            dtype=torsion_angles_sin_cos.dtype,
            keepdims=True,
        )
        + 1e-8
    )
    torsion_angles_sin_cos = torsion_angles_sin_cos / denom

    # Flip the sign of the third torsion (psi) only.
    # NOTE(review): presumably a sign convention inherited from Uni-Fold -- confirm.
    torsion_angles_sin_cos = (
        torsion_angles_sin_cos
        * all_atom_mask.new_tensor(
            [1.0, 1.0, -1.0, 1.0, 1.0, 1.0, 1.0],
        )[((None,) * len(torsion_angles_sin_cos.shape[:-2])) + (slice(None), None)]
    )

    # Alternative angles: chi angles flagged in rc.chi_pi_periodic are
    # multiplied by -1, all other torsions are left unchanged.
    chi_is_ambiguous = torsion_angles_sin_cos.new_tensor(
        rc.chi_pi_periodic,
    )[aatype, ...]
    mirror_torsion_angles = torch.cat(
        [
            all_atom_mask.new_ones(*aatype.shape, 3),
            1.0 - 2.0 * chi_is_ambiguous,
        ],
        dim=-1,
    )
    alt_torsion_angles_sin_cos = (
        torsion_angles_sin_cos * mirror_torsion_angles[..., None]
    )

    # consistent to uni-fold. use [1, 0] placeholder
    placeholder_torsions = torch.stack(
        [
            torch.ones(torsion_angles_sin_cos.shape[:-1]),
            torch.zeros(torsion_angles_sin_cos.shape[:-1]),
        ],
        dim=-1,
    )
    torsion_angles_sin_cos = torsion_angles_sin_cos * torsion_angles_mask[
        ..., None
    ] + placeholder_torsions * (1 - torsion_angles_mask[..., None])
    alt_torsion_angles_sin_cos = alt_torsion_angles_sin_cos * torsion_angles_mask[
        ..., None
    ] + placeholder_torsions * (1 - torsion_angles_mask[..., None])

    # Hand NumPy arrays back to the (NumPy-based) caller.
    torsion_angles_sin_cos = np.array(torsion_angles_sin_cos)
    alt_torsion_angles_sin_cos = np.array(alt_torsion_angles_sin_cos)
    torsion_angles_mask = np.array(torsion_angles_mask)
    return torsion_angles_sin_cos, alt_torsion_angles_sin_cos, torsion_angles_mask
代码
文本
[7]
def no_pos_dist_array(n):
    """Return, for each index in ``range(n)``, its distance to the nearer end.

    Example: ``n = 5`` gives ``[0, 1, 2, 1, 0]``.
    """
    from_start = np.arange(n)
    from_end = n - from_start - 1
    return np.minimum(from_start, from_end)
def convert_to_single_emb(x, sizes):
    """Shift each feature column of ``x`` into its own disjoint index range.

    Column ``i`` (along the last axis) is offset by ``1 + sum(sizes[:i])``, so
    index 0 stays unused and the columns no longer overlap.
    NOTE(review): presumably used to share one embedding table across several
    categorical features -- confirm with the caller.

    ``x`` is modified in place and also returned.

    Raises:
        AssertionError: if the last axis of ``x`` does not match
            ``len(sizes)`` or a value is not smaller than its column's size.
    """
    assert x.shape[-1] == len(sizes)
    running_offset = 1
    for column, vocab_size in enumerate(sizes):
        channel = x[..., column]
        assert (channel < vocab_size).all()
        x[..., column] = channel + running_offset
        running_offset += vocab_size
    return x
代码
文本
Get the protein residue sequence and position information
代码
文本
[15]
def _group_assembly_operators(assembly_entry):
    """Group the operators of one assembly entry by chain id."""
    opers_by_chain = {}
    for chain, oper in zip(assembly_entry['chains'], assembly_entry['opers']):
        opers_by_chain.setdefault(chain, []).append(oper)
    return opers_by_chain


def _operator_signature(oper):
    """Return a string form of one operator so operators can be put in sets."""
    if isinstance(oper, str):
        return oper
    assert isinstance(oper, list)
    # Non-string operators are [rotation, translation]; flatten both parts.
    return ' '.join([str(x) for x in oper[0] + oper[1]])


def _load_ligand_operators(mol_file_path, opers_by_chain, chain_signatures):
    """Map every ligand listed in ligand_info.json to its assembly operators.

    Returns a dict ``ligand name -> list of operators`` (``{}`` for ligands
    that must be skipped), or ``None`` when the chains touched by one ligand
    carry inconsistent operator sets. ``chain_signatures`` is extended in
    place with a ``{'no'}`` placeholder for chains absent from the assembly.
    """
    # The file holds comma-separated JSON objects; wrap them into a JSON list.
    with open(mol_file_path + '/' + 'ligand_info.json', 'r') as handle:
        raw = handle.read()
    ligand_records = json.loads('[' + raw.strip().strip(',').strip() + ']')

    ligand_opers = {}
    for record in ligand_records:
        if record['ligand_type'] == 'covalent':
            # Covalent ligands get an empty entry, i.e. they are skipped later.
            ligand = record['ligand']
            ligand_opers[ligand[0] + '_' + ligand[1] + '_' + str(ligand[2])] = {}
            continue

        residues = record['ligand_residues']
        if isinstance(residues, list) and len(residues) > 1:
            # A single residue stored as a flat list (second field has no
            # '_') is wrapped so the loop below always sees a list of residues.
            if isinstance(residues[1], str) and '_' not in residues[1]:
                residues = [residues]

        # Chains in first-seen order. (The former list(set(...)) round-trip
        # made the reference chain depend on hash ordering.)
        chains = []
        for res in residues:
            if isinstance(res, str):
                chain_id = res.split('_')[1]
            else:
                assert isinstance(res, list)
                chain_id = res[1]
            if chain_id not in chains:
                chains.append(chain_id)
            if chain_id not in chain_signatures:
                chain_signatures[chain_id] = set(['no'])

        # Every other chain must not carry operators the first chain lacks.
        reference = chain_signatures[chains[0]]
        for chain_id in chains[1:]:
            if chain_signatures[chain_id] - reference:
                return None

        key = (
            record['ligand_resname'] + '_' + record['ligand_chain']
            + '_' + str(record['ligand_resid'])
        )
        if chains[0] not in opers_by_chain:
            ligand_opers[key] = {}
        else:
            ligand_opers[key] = opers_by_chain[chains[0]]
    return ligand_opers


def _collect_ligand_atoms(mol_files, ligand_opers):
    """Return heavy-atom coordinates of all ligand copies and their copy ids.

    Each (ligand, operator) pair yields one coordinate array and one id array
    filled with a running 1-based copy index. Ligands that cannot be read or
    have fewer than 4 non-hydrogen atoms are skipped.
    """
    coords = []
    copy_ids = []
    copy_index = 0
    for mol_file in mol_files:
        mol_name = mol_file.split('/')[-1].split('.')[0]
        operators = ligand_opers[mol_name]
        if len(operators) == 0:
            continue
        try:
            # The structure lives next to the json file, with a .pdb suffix.
            mol = AllChem.MolFromPDBFile(mol_file[:-5] + '.pdb')
            positions = mol.GetConformer().GetPositions()
            symbols = np.array([atom.GetSymbol() for atom in mol.GetAtoms()])
            heavy = symbols != 'H'
            heavy_count = len(positions[heavy])
        except Exception:
            # Best effort: unreadable ligands are skipped, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            continue
        if heavy_count < 4:
            continue
        for oper in operators:
            copy_index += 1
            if oper == 'I':
                placed = positions
            else:
                rot, trans = oper
                rot = np.array(rot).reshape(3, 3)
                trans = np.array(trans).reshape(3)
                placed = positions @ rot.T + trans
            heavy_atoms = placed[heavy]
            coords.append(heavy_atoms)
            copy_ids.append(np.ones(heavy_atoms.shape[0]) * copy_index)
    return coords, copy_ids


def get_label(input_args):
    """Build the pickled protein/ligand record for one mmCIF entry.

    Args:
        input_args: tuple ``(mmcif_file, task_id)``. ``mmcif_file`` is the path
            of a gzip-compressed mmCIF file whose basename starts with the PDB
            id; ``task_id`` names the sub-directory of ``./mol_data`` that
            holds the ligand files of that entry.

    Returns:
        ``(payload, status)``. On success ``payload`` is ``pickle.dumps`` of a
        dict with keys ``pdb_id``, ``atom_pos``, ``residue_pos``,
        ``residue_type``, ``pro_opers``, ``chain_id``, ``chain``,
        ``atom_masks``, ``torsion_angles_sin_cos``, ``torsion_angles_mask``
        and, when ligands were found, ``mol_list`` / ``mol_idx_list``;
        ``status`` is ``'train'`` for entries released before 2022-05-15 and
        ``'valid'`` otherwise. When the entry is skipped ``payload`` is
        ``None`` and ``status`` is one of ``"not in assembly"``,
        ``"Parse mmcif error"``, ``'no consistant'`` or ``'no protein'``.
    """
    mmcif_file, task_id = input_args
    pdb_id = os.path.basename(mmcif_file).split(".")[0]
    if pdb_id not in pdb_assembly:
        return None, "not in assembly"

    with gzip.open(mmcif_file, "rb") as fn:
        cif_string = fn.read().decode("utf8")
    parsing_result = parse(file_id=pdb_id, mmcif_string=cif_string)
    mmcif_obj = parsing_result.mmcif_object
    if mmcif_obj is None:
        return None, "Parse mmcif error"

    pdb_assembly_dict = _group_assembly_operators(pdb_assembly[pdb_id])
    chain_signatures = {
        chain: set([_operator_signature(oper) for oper in opers])
        for chain, opers in pdb_assembly_dict.items()
    }

    # One json file per ligand; the summary file and 'UNK' ligands are ignored.
    mol_file_path = './mol_data/' + str(task_id) + '/' + str(pdb_id)
    mol_files = [
        os.path.join(mol_file_path, name)
        for name in os.listdir(mol_file_path)
        if name.endswith('json') and 'ligand_info' not in name and name[:3] != 'UNK'
    ]
    mol_opers = {}
    if len(mol_files) > 0:
        mol_opers = _load_ligand_operators(
            mol_file_path, pdb_assembly_dict, chain_signatures
        )
        if mol_opers is None:
            return None, 'no consistant'

    # NOTE(review): debug output kept so the function's stdout is unchanged;
    # consider removing it before running inside a worker pool.
    print(pdb_assembly_dict)

    # Ion features are deliberately not collected (the author noted they
    # seemed unnecessary).
    data = {}
    data['pdb_id'] = pdb_id
    data['atom_pos'] = []
    data['residue_pos'] = []
    data['residue_type'] = []
    data['pro_opers'] = []
    data['chain_id'] = []
    data['chain'] = []
    data['atom_masks'] = []
    data['torsion_angles_sin_cos'] = []
    data['torsion_angles_mask'] = []

    # Flatten all ligand atoms into one array to simplify later computations.
    mol_list, mol_idx_list = _collect_ligand_atoms(mol_files, mol_opers)
    if len(mol_list) > 0:
        mol_list = np.concatenate(mol_list)
        mol_idx_list = np.concatenate(mol_idx_list)
        assert mol_list.shape[0] == mol_idx_list.shape[0]
        data['mol_list'] = mol_list
        data['mol_idx_list'] = mol_idx_list

    date = mmcif_obj.header["release_date"]
    resolution = mmcif_obj.header["resolution"]

    chain_idx = 0
    for chain_id in mmcif_obj.chain_to_seqres:
        if chain_id not in pdb_assembly_dict:
            continue
        all_atom_positions, all_atom_mask = get_atom_positions(
            mmcif_obj, chain_id, max_ca_ca_distance=float("inf")
        )
        sequence = mmcif_object_to_fasta(mmcif_obj, chain_id)
        aatype_idx = np.array(
            [
                restype_order_with_x.get(rn, restype_order_with_x["X"])
                for rn in sequence
            ]
        )
        seq_len = aatype_idx.shape[0]
        _, counts = np.unique(aatype_idx, return_counts=True)
        freqs = counts.astype(np.float32) / seq_len
        max_freq = np.max(freqs)
        # Skip low-resolution entries and chains dominated by one residue type.
        if resolution > 9 or max_freq > 0.8:
            continue

        aatype_index = aatype_idx.astype(np.int8)
        origin_pos = all_atom_positions.astype(np.float32)
        all_atom_mask = all_atom_mask.astype(np.int8)
        torsion_angles_sin_cos, _, torsion_angles_mask = atom37_to_torsion_angles(
            aatype_index, origin_pos, all_atom_mask
        )

        # The index advances before the emptiness check below, so stored
        # chain ids are 1-based and may have gaps (kept as is).
        chain_idx += 1
        if not (len(all_atom_mask) > 0 and (all_atom_mask > 0).any()):
            continue

        data['atom_pos'].append(origin_pos)
        data['residue_pos'].append(np.arange(0, origin_pos.shape[0], 1))
        data['residue_type'].append(aatype_index)
        data['chain_id'].append(np.ones(origin_pos.shape[0]) * chain_idx)
        data['atom_masks'].append(all_atom_mask)
        data['pro_opers'].append(pdb_assembly_dict[chain_id])
        data['torsion_angles_sin_cos'].append(torsion_angles_sin_cos.reshape(-1, 14))
        data['torsion_angles_mask'].append(torsion_angles_mask)
        data['chain'].append(chain_id)

    if len(data['residue_type']) == 0:
        # Previously an `assert 1==0` in front of this return made it
        # unreachable and aborted the whole run on the first such entry.
        return None, 'no protein'

    split_date = datetime.datetime.strptime("2022-05-15", "%Y-%m-%d")
    date = datetime.datetime.strptime(date, "%Y-%m-%d")
    if date < split_date:
        return pickle.dumps(data, protocol=-1), 'train'
    return pickle.dumps(data, protocol=-1), 'valid'
代码
文本
[21]
def main():
    """Run `get_label` on one hard-coded entry and print the decoded record.

    This is a single-file smoke test; the commented-out code that follows is
    the full multi-process LMDB export.
    """
    path = r"./mmcif_file/6tan.cif.gz"
    data, info = get_label((path, "ah"))
    if data is None:
        # get_label reports skipped entries as (None, reason); unpickling
        # None would raise a TypeError, so print the reason instead.
        print(info)
        return
    print(pickle.loads(data))
# exit()
# parser = argparse.ArgumentParser()
# parser.add_argument("--mmcif-dir", type=str, default="/your path")
# parser.add_argument("--label-dir", type=str, default="")
# parser.add_argument("--output-fn", type=str, default="")
# parser.add_argument("--debug", action="store_true", default=False)
# args = parser.parse_args()
# print(args)
# os.makedirs(os.path.dirname(args.output_fn), exist_ok=True)
# print("start")
# tasks = ['a' + chr(y) for y in range(97, 123)]
# pro = "5zz8"
# # task_list = ['af']
# # task_list = ['a'+_ for _ in 'abcdefghijklmnopqrstuvwxyz']
# task_list = [task for task in tasks if os.path.isdir('/your path/'+task+'/'+pro)]
# mmcif_files = []
# for item in task_list:
# # pdb_list = [_ for _ in os.listdir('/your path/'+item+'/') if os.path.isdir('/your path/'+item+'/'+_)]
# pdb_list = [pro]
# mmcif_files.extend([(os.path.join(args.mmcif_dir, _+'.cif.gz'), item) for _ in pdb_list])
# print(mmcif_files)
# file_cnt = len(mmcif_files)
# print(f"len(mmcif_files): {len(mmcif_files)}")
# get_label(mmcif_files[0])
# mmcif_files = mmcif_files[9879:]
# outputfilename = os.path.join('/your path', 'train_debug.lmdb')
# try:
# os.remove(outputfilename)
# except:
# pass
# env_new_train = lmdb.open(
# outputfilename,
# subdir=False,
# readonly=False,
# lock=False,
# readahead=False,
# meminit=False,
# max_readers=1,
# map_size=int(100e9),
# )
# txn_write_train = env_new_train.begin(write=True)
# validfilename = os.path.join('/your path', 'valid_debug.lmdb')
# try:
# os.remove(validfilename)
# except:
# pass
# env_new_valid = lmdb.open(
# validfilename,
# subdir=False,
# readonly=False,
# lock=False,
# readahead=False,
# meminit=False,
# max_readers=1,
# map_size=int(100e9),
# )
# txn_write_valid = env_new_valid.begin(write=True)
# len_dict = {}
# with Pool(116) as pool:
# i=0
# j=0
# for ret in tqdm(
# pool.imap(get_label, mmcif_files), total=file_cnt
# ):
# ret, info = ret
# # assert info is None, (ret, info)
# if ret is not None:
# if info == 'train':
# txn_write_train.put(f'{i}'.encode("ascii"), ret)
# i += 1
# if i % 100 == 0:
# txn_write_train.commit()
# txn_write_train = env_new_train.begin(write=True)
# elif info == 'valid':
# txn_write_valid.put(f'{j}'.encode("ascii"), ret)
# j += 1
# if j % 100 == 0:
# txn_write_valid.commit()
# txn_write_valid = env_new_valid.begin(write=True)
# print('{} process {} lines'.format(outputfilename, i))
# txn_write_train.commit()
# env_new_train.close()
# print('{} process {} lines'.format(validfilename, j))
# txn_write_valid.commit()
# env_new_valid.close()
代码
文本
[22]
# Guarded so that importing this module (e.g. by multiprocessing workers
# started with the "spawn" method) does not re-run the pipeline.
if __name__ == "__main__":
    main()
{'A': ['I']} {'pdb_id': '6tan', 'atom_pos': [array([[[ 2.6288e+01, 1.8113e+01, 6.4640e+00], [ 2.5922e+01, 1.7360e+01, 5.2600e+00], [ 2.4557e+01, 1.6742e+01, 5.4120e+00], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00]], [[ 2.4279e+01, 1.5796e+01, 4.5530e+00], [ 2.2969e+01, 1.5124e+01, 4.5300e+00], [ 2.3180e+01, 1.3653e+01, 4.1580e+00], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00]], [[ 2.2241e+01, 1.2809e+01, 4.5430e+00], [ 2.2230e+01, 1.1404e+01, 4.1000e+00], [ 2.0867e+01, 1.1149e+01, 3.4740e+00], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00]], ..., [[ 2.4879e+01, 2.3000e-02, 8.1450e+00], [ 2.5918e+01, 6.1500e-01, 7.2760e+00], [ 2.7272e+01, 6.3500e-01, 7.9990e+00], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 2.3937e+01, 5.6870e+00, 4.8650e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00]], [[ 2.7267e+01, 5.8700e-01, 9.3210e+00], [ 2.8514e+01, 6.7100e-01, 1.0112e+01], [ 2.9242e+01, -6.6800e-01, 1.0030e+01], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00]], [[ 2.8558e+01, -1.7560e+00, 9.7070e+00], [ 2.9186e+01, -3.0940e+00, 9.7320e+00], [ 3.0312e+01, -3.1960e+00, 8.7040e+00], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 2.4561e+01, -6.7460e+00, 1.1791e+01], [ 3.0233e+01, -2.5460e+00, 7.6300e+00]]], dtype=float32)], 'residue_pos': [array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 
111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169])], 'residue_type': [array([ 7, 12, 16, 6, 18, 11, 10, 19, 19, 19, 7, 0, 4, 7, 19, 7, 11, 15, 0, 10, 16, 9, 5, 10, 9, 5, 2, 8, 13, 19, 3, 6, 18, 3, 14, 16, 9, 6, 3, 15, 18, 1, 11, 5, 19, 19, 9, 3, 7, 6, 16, 15, 10, 10, 3, 9, 10, 3, 16, 0, 7, 5, 6, 6, 18, 15, 0, 12, 1, 3, 5, 18, 12, 1, 16, 7, 6, 7, 13, 10, 10, 19, 13, 0, 9, 2, 2, 16, 11, 15, 13, 6, 3, 9, 8, 8, 18, 1, 6, 5, 9, 11, 1, 19, 11, 3, 15, 6, 3, 19, 14, 12, 19, 10, 19, 7, 2, 11, 15, 3, 10, 14, 15, 1, 16, 19, 3, 16, 11, 5, 0, 5, 3, 10, 0, 1, 15, 18, 7, 9, 14, 13, 9, 6, 16, 15, 0, 11, 16, 1, 5, 7, 19, 3, 3, 0, 13, 18, 16, 10, 19, 1, 6, 9, 1, 11, 8, 11, 6, 11], dtype=int8)], 'pro_opers': [['I']], 'chain_id': [array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])], 'chain': ['A'], 'atom_masks': [array([[1, 1, 1, ..., 0, 0, 0], [1, 1, 1, ..., 0, 0, 0], [1, 1, 1, ..., 0, 0, 0], ..., [1, 1, 1, ..., 0, 1, 0], [1, 1, 1, ..., 0, 0, 0], [1, 1, 1, ..., 0, 1, 1]], dtype=int8)], 'torsion_angles_sin_cos': [array([[ 1. , 0. , 1. , ..., 0. , 1. , 0. ], [-0.04181683, -0.9991253 , -0.5784515 , ..., 0.6210887 , 1. 
, 0. ], [ 0.14288986, -0.98973864, -0.814764 , ..., 0. , 1. , 0. ], ..., [-0.09878621, -0.9951087 , -0.93555635, ..., 0.45058557, -0.15159912, -0.98844206], [-0.05579257, -0.9984424 , -0.9700919 , ..., -0.9168765 , 1. , 0. ], [-0.09718688, -0.9952661 , -0.8983881 , ..., -0.90295136, 0.03551648, -0.99936914]], dtype=float32)], 'torsion_angles_mask': [array([[0, 0, 1, ..., 0, 0, 0], [1, 1, 1, ..., 1, 1, 0], [1, 1, 1, ..., 0, 0, 0], ..., [1, 1, 1, ..., 1, 1, 1], [1, 1, 1, ..., 1, 1, 0], [1, 1, 1, ..., 1, 1, 1]])], 'mol_list': array([[-4.726, 3.431, 3.629], [-3.11 , 1.254, 2.538], [-6.793, 2.313, 4.811], [-5.757, 3.316, 4.857], [-7.954, 2.747, 5.684], [-7.563, 2.845, 7.064], [-8.498, 4.13 , 5.318], [-9.916, 4.16 , 5.547], [-7.782, 5.05 , 6.282], [-8.569, 6.169, 6.556], [-7.697, 4.188, 7.518], [-6.224, 4.867, 12.362], [-5.48 , 3.575, 2.338], [-3.997, 0.395, 1.659], [-7.497, 4.801, 11.871], [-8.527, 4.875, 12.733], [-3.769, 4.486, 4.021], [-2.201, 0.393, 3.425], [-7.733, 4.637, 10.581], [-4.091, 1.957, 3.673], [-2.4 , 2.39 , 1.803], [-6.657, 4.563, 9.758], [-5.288, 4.661, 10.228], [-5.087, 4.804, 11.638], [-3.972, 4.831, 12.198], [-4.454, 4.514, 9.173], [-5.218, 4.37 , 8.06 ], [-6.56 , 4.407, 8.428]]), 'mol_idx_list': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])}
代码
文本
已赞1
推荐阅读
公开
HPLC retention time predictionbohr6ef000
更新于 2024-09-09
公开
train.ipynbInspiration
更新于 2024-07-05
1 转存文件
评论
Roger