Uni-Mol
Bioinformatics
Uni-Mol Bioinformatics
Heisenburger
Published 2023-07-10

Protein pocket data process

[3]
import os
import sys
import torch
import glob
from Bio.Data.PDBData import protein_letters_3to1
import json
from tqdm import tqdm
from multiprocessing import Pool
from functools import lru_cache
from unifold.msa.mmcif import parse
import argparse
import gzip
import numpy as np
from unifold.data.residue_constants import restype_order_with_x
from unifold.msa.templates import _get_atom_positions as get_atom_positions
import pickle
import lmdb
from rdkit import Chem
from rdkit.Chem import AllChem
from scipy.spatial import distance_matrix
from unifold.data.protein import Protein
from unifold.data.protein import to_pdb
import shutil
import datetime
from unifold.data import residue_constants as rc
from unifold.modules.frame import Rotation, Frame
from unicore.utils import batched_gather
import warnings
from unicore.data import BaseWrapperDataset, UnicoreDataset
import torch.nn as nn
from scipy.spatial.transform import Rotation  # NOTE: shadows unifold.modules.frame.Rotation imported above

warnings.filterwarnings("ignore", category=DeprecationWarning)
/opt/conda/lib/python3.8/site-packages/Bio/Data/SCOPData.py:18: BiopythonDeprecationWarning: The 'Bio.Data.SCOPData' module will be deprecated in a future release of Biopython in favor of 'Bio.Data.PDBData.
  warnings.warn(

Some constants: biological-assembly operators per PDB entry, standard amino-acid names, metal elements, and nucleic-acid residue names.

[4]
# biological-assembly operations per PDB entry (chain ids + symmetry operators)
pdb_assembly = json.load(open("mmcif_assembly3_origin.json"))


# standard amino-acid residue names (including alternative protonation-state names)
res_dict = set(["ALA", "ARG", "ASN", "ASP", "CYS", "CYX", "GLN", "GLU", "GLY", "HIS",
                "HID", "HIE", "HIP", "ILE", "LEU", "LYS", "MET", "PHE", "PRO", "SER",
                "THR", "TRP", "TYR", "VAL"])

# metal element symbols treated as ions
metal = ['MN', 'CA', 'ZN', 'CU', 'NA', 'FE', 'MG', 'NI', 'CD', 'HG', 'K', 'RH', 'CR', 'PR', 'V', 'CO',
         'LA', 'FR', 'LI', 'RB', 'CS', 'SR', 'BA', 'RA', 'CE', 'ND', 'SM', 'EU', 'TH', 'PA',
         'U', 'NP', 'PU', 'TI', 'AG', 'AU', 'AL', 'SN', 'PB', 'SB', 'RU', 'BI', 'IR', 'W', 'YB', 'PD',
         'ER', 'PT', 'RE', 'Y', 'TB', 'LU', 'SC', 'OS', 'GA', 'AM', 'CM', 'FM',
         'ES', 'CF', 'IN']

# nucleic-acid residue names (DNA/RNA)
basic_group = set(["DA", "DC", "DG", "DT", "DU", "DI", "A", "C", "G", "T", "U"])
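
The sets res_dict, metal and basic_group are not referenced in the cells shown below; they indicate how residue names are meant to be partitioned. A hypothetical helper (classify_residue is not part of the original pipeline) illustrating that intent:

def classify_residue(resname):
    # hypothetical illustration: partition a residue/ligand name using the constant sets above
    if resname in res_dict:
        return "protein"
    if resname in basic_group:
        return "nucleic acid"
    if resname in metal:
        return "metal ion"
    return "other ligand"

print(classify_residue("ALA"), classify_residue("DA"), classify_residue("ZN"), classify_residue("HEM"))
# expected: protein nucleic acid metal ion other ligand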

Some utility functions

[5]
def mmcif_object_to_fasta(mmcif_object, auth_chain_id: str) -> str:
    residues = mmcif_object.seqres_to_structure[auth_chain_id]
    residue_names = [residues[t].name for t in range(len(residues))]
    residue_letters = [
        protein_letters_3to1[n] if n in protein_letters_3to1.keys() else "X"
        for n in residue_names
    ]
    # take care of cases where residue letters are of length 3
    # simply by replacing them as 'X' ('UNK')
    filter_out_triple_letters = lambda x: x if len(x) == 1 else "X"
    fasta_string = "".join([filter_out_triple_letters(n) for n in residue_letters])
    return fasta_string
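
A minimal sanity-check sketch for this helper. It assumes the same local file ./mmcif_file/6tan.cif.gz used in main() below, and that author chain "A" exists in the parsed structure:

with gzip.open("./mmcif_file/6tan.cif.gz", "rb") as fn:
    cif_string = fn.read().decode("utf8")
mmcif_obj = parse(file_id="6tan", mmcif_string=cif_string).mmcif_object

# one-letter sequence of author chain "A"; unknown/modified residues map to "X"
print(mmcif_object_to_fasta(mmcif_obj, "A"))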
[6]
def atom37_to_torsion_angles(
    aatype,
    all_atom_positions,
    all_atom_mask
):
    aatype = torch.from_numpy(aatype).long()
    all_atom_positions = torch.from_numpy(all_atom_positions)
    all_atom_mask = torch.from_numpy(all_atom_mask)
    if aatype.shape[-1] == 0:
        base_shape = aatype.shape
        torsion_angles_sin_cos = all_atom_positions.new_zeros(
            *base_shape, 7, 2
        )
        alt_torsion_angles_sin_cos = all_atom_positions.new_zeros(
            *base_shape, 7, 2
        )
        torsion_angles_mask = all_atom_positions.new_zeros(
            *base_shape, 7
        )
        return torsion_angles_sin_cos, alt_torsion_angles_sin_cos, torsion_angles_mask

    aatype = torch.clamp(aatype, max=20)

    pad = all_atom_positions.new_zeros([*all_atom_positions.shape[:-3], 1, 37, 3])
    prev_all_atom_positions = torch.cat(
        [pad, all_atom_positions[..., :-1, :, :]], dim=-3
    )

    pad = all_atom_mask.new_zeros([*all_atom_mask.shape[:-2], 1, 37])
    prev_all_atom_mask = torch.cat([pad, all_atom_mask[..., :-1, :]], dim=-2)

    pre_omega_atom_pos = torch.cat(
        [prev_all_atom_positions[..., 1:3, :], all_atom_positions[..., :2, :]],
        dim=-2,
    )
    phi_atom_pos = torch.cat(
        [prev_all_atom_positions[..., 2:3, :], all_atom_positions[..., :3, :]],
        dim=-2,
    )
    psi_atom_pos = torch.cat(
        [all_atom_positions[..., :3, :], all_atom_positions[..., 4:5, :]],
        dim=-2,
    )

    pre_omega_mask = torch.prod(prev_all_atom_mask[..., 1:3], dim=-1) * torch.prod(
        all_atom_mask[..., :2], dim=-1
    )
    phi_mask = prev_all_atom_mask[..., 2] * torch.prod(
        all_atom_mask[..., :3], dim=-1, dtype=all_atom_mask.dtype
    )
    psi_mask = (
        torch.prod(all_atom_mask[..., :3], dim=-1, dtype=all_atom_mask.dtype)
        * all_atom_mask[..., 4]
    )

    chi_atom_indices = torch.as_tensor(rc.chi_atom_indices, device=aatype.device)

    atom_indices = chi_atom_indices[..., aatype, :, :]
    chis_atom_pos = batched_gather(
        all_atom_positions, atom_indices, -2, len(atom_indices.shape[:-2])
    )

    chi_angles_mask = list(rc.chi_angles_mask)
    chi_angles_mask.append([0.0, 0.0, 0.0, 0.0])
    chi_angles_mask = all_atom_mask.new_tensor(chi_angles_mask)

    chis_mask = chi_angles_mask[aatype, :]

    chi_angle_atoms_mask = batched_gather(
        all_atom_mask,
        atom_indices,
        dim=-1,
        num_batch_dims=len(atom_indices.shape[:-2]),
    )
    chi_angle_atoms_mask = torch.prod(
        chi_angle_atoms_mask, dim=-1, dtype=chi_angle_atoms_mask.dtype
    )
    chis_mask = chis_mask * chi_angle_atoms_mask

    torsions_atom_pos = torch.cat(
        [
            pre_omega_atom_pos[..., None, :, :],
            phi_atom_pos[..., None, :, :],
            psi_atom_pos[..., None, :, :],
            chis_atom_pos,
        ],
        dim=-3,
    )

    torsion_angles_mask = torch.cat(
        [
            pre_omega_mask[..., None],
            phi_mask[..., None],
            psi_mask[..., None],
            chis_mask,
        ],
        dim=-1,
    )

    torsion_frames = Frame.from_3_points(
        torsions_atom_pos[..., 1, :],
        torsions_atom_pos[..., 2, :],
        torsions_atom_pos[..., 0, :],
        eps=1e-8,
    )

    fourth_atom_rel_pos = torsion_frames.invert().apply(torsions_atom_pos[..., 3, :])

    torsion_angles_sin_cos = torch.stack(
        [fourth_atom_rel_pos[..., 2], fourth_atom_rel_pos[..., 1]], dim=-1
    )

    denom = torch.sqrt(
        torch.sum(
            torch.square(torsion_angles_sin_cos),
            dim=-1,
            dtype=torsion_angles_sin_cos.dtype,
            keepdims=True,
        )
        + 1e-8
    )
    torsion_angles_sin_cos = torsion_angles_sin_cos / denom

    torsion_angles_sin_cos = (
        torsion_angles_sin_cos
        * all_atom_mask.new_tensor(
            [1.0, 1.0, -1.0, 1.0, 1.0, 1.0, 1.0],
        )[((None,) * len(torsion_angles_sin_cos.shape[:-2])) + (slice(None), None)]
    )

    chi_is_ambiguous = torsion_angles_sin_cos.new_tensor(
        rc.chi_pi_periodic,
    )[aatype, ...]

    mirror_torsion_angles = torch.cat(
        [
            all_atom_mask.new_ones(*aatype.shape, 3),
            1.0 - 2.0 * chi_is_ambiguous,
        ],
        dim=-1,
    )

    alt_torsion_angles_sin_cos = (
        torsion_angles_sin_cos * mirror_torsion_angles[..., None]
    )

    # consistent to uni-fold. use [1, 0] placeholder
    placeholder_torsions = torch.stack(
        [
            torch.ones(torsion_angles_sin_cos.shape[:-1]),
            torch.zeros(torsion_angles_sin_cos.shape[:-1]),
        ],
        dim=-1,
    )
    torsion_angles_sin_cos = torsion_angles_sin_cos * torsion_angles_mask[
        ..., None
    ] + placeholder_torsions * (1 - torsion_angles_mask[..., None])
    alt_torsion_angles_sin_cos = alt_torsion_angles_sin_cos * torsion_angles_mask[
        ..., None
    ] + placeholder_torsions * (1 - torsion_angles_mask[..., None])

    torsion_angles_sin_cos = np.array(torsion_angles_sin_cos)
    alt_torsion_angles_sin_cos = np.array(alt_torsion_angles_sin_cos)
    torsion_angles_mask = np.array(torsion_angles_mask)

    return torsion_angles_sin_cos, alt_torsion_angles_sin_cos, torsion_angles_mask
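
A quick shape check on dummy inputs (hypothetical all-zero arrays for a 5-residue chain) shows what the function returns: sin/cos pairs for pre-omega, phi, psi and the four chi angles, plus a per-angle validity mask. In get_label below, the (L, 7, 2) array is flattened to (L, 14) before being stored.

L = 5  # hypothetical chain length
aatype = np.zeros(L, dtype=np.int8)                  # all residues set to type 0
dummy_pos = np.zeros((L, 37, 3), dtype=np.float32)   # atom37 coordinates
dummy_mask = np.zeros((L, 37), dtype=np.int8)        # no atoms resolved

sin_cos, alt_sin_cos, mask = atom37_to_torsion_angles(aatype, dummy_pos, dummy_mask)
print(sin_cos.shape, alt_sin_cos.shape, mask.shape)  # (5, 7, 2) (5, 7, 2) (5, 7)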
[7]
def no_pos_dist_array(n):
    arr = np.arange(n)
    return np.minimum(arr, n - arr - 1)


def convert_to_single_emb(x, sizes):
    # [128, 128]
    assert x.shape[-1] == len(sizes)
    offset = 1
    for i in range(len(sizes)):
        assert (x[..., i] < sizes[i]).all()
        x[..., i] = x[..., i] + offset
        offset += sizes[i]
    return x
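
A small worked example on hypothetical inputs: no_pos_dist_array maps every index to its distance from the nearer sequence end, and convert_to_single_emb shifts each categorical column into its own index range (note it modifies x in place) so all columns can share a single embedding table.

print(no_pos_dist_array(6))                    # [0 1 2 2 1 0]

x = np.array([[0, 1], [2, 3]])                 # two rows with two categorical features
print(convert_to_single_emb(x, sizes=[4, 8]))  # [[1 6] [3 8]]: column offsets are 1 and 1 + 4 = 5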

Get protein residue sequence and position information

[15]
def get_label(input_args):

    mmcif_file, task_id = input_args
    pdb_id = os.path.basename(mmcif_file).split(".")[0]
    if pdb_id not in pdb_assembly:
        return None, "not in assembly"

    with gzip.open(mmcif_file, "rb") as fn:
        cif_string = fn.read().decode("utf8")
    parsing_result = parse(file_id=pdb_id, mmcif_string=cif_string)
    mmcif_obj = parsing_result.mmcif_object

    information = []
    count_all = 0
    count = 0
    if mmcif_obj is not None:
        flag = 1

        pdb_assembly_dict = {}
        if pdb_id in pdb_assembly:
            pdb_assembly_t = pdb_assembly[pdb_id]
            for t in range(len(pdb_assembly_t['chains'])):
                if pdb_assembly_t['chains'][t] in pdb_assembly_dict:
                    pdb_assembly_dict[pdb_assembly_t['chains'][t]].append(pdb_assembly_t['opers'][t])
                else:
                    pdb_assembly_dict[pdb_assembly_t['chains'][t]] = [pdb_assembly_t['opers'][t]]
        pdb_assembly_str_dict = {}
        for item in pdb_assembly_dict:
            opers_str = []
            for opers in pdb_assembly_dict[item]:
                if isinstance(opers, str):
                    opers = opers
                else:
                    assert isinstance(opers, list)
                    opers = opers[0] + opers[1]
                    opers = ' '.join([str(x) for x in opers])
                opers_str.append(opers)
            pdb_assembly_str_dict[item] = set(opers_str)
        mol_files = []
        mol_file_path = './mol_data/' + str(task_id) + '/' + str(pdb_id)
        mol_files.extend([os.path.join(mol_file_path, _) for _ in os.listdir(mol_file_path)
                          if _.endswith('json') and 'ligand_info' not in _ and _[:3] != 'UNK'])
        # if len(mol_files) == 0:
        #     return None, 'no mol'
        if len(mol_files) > 0:
            mol_json = json.loads('[' + open(mol_file_path + '/' + 'ligand_info.json', 'r').read().strip().strip(',').strip() + ']')
            mol_opers = {}
            for item in mol_json:
                if item['ligand_type'] == 'covalent':
                    mol_opers[item['ligand'][0] + '_' + item['ligand'][1] + '_' + str(item['ligand'][2])] = {}
                    continue
                mol_chain = []
                if isinstance(item['ligand_residues'], list) and len(item['ligand_residues']) > 1:
                    if isinstance(item['ligand_residues'][1], str) and '_' not in item['ligand_residues'][1]:
                        item['ligand_residues'] = [item['ligand_residues']]
                for res in item['ligand_residues']:
                    if isinstance(res, str):
                        chain_id = res.split('_')[1]
                    else:
                        assert isinstance(res, list)
                        chain_id = res[1]
                    if chain_id not in mol_chain:
                        mol_chain.append(chain_id)
                    if chain_id not in pdb_assembly_str_dict:
                        pdb_assembly_str_dict[chain_id] = set(['no'])
                mol_chain = list(set(mol_chain))
                origin = pdb_assembly_str_dict[mol_chain[0]]
                for chain_id in mol_chain[1:]:
                    if len(pdb_assembly_str_dict[chain_id] - origin) > 0:
                        return None, 'no consistant'
                if mol_chain[0] not in pdb_assembly_dict:
                    mol_opers[item['ligand_resname'] + '_' + item['ligand_chain'] + '_' + str(item['ligand_resid'])] = {}
                else:
                    mol_opers[item['ligand_resname'] + '_' + item['ligand_chain'] + '_' + str(item['ligand_resid'])] = pdb_assembly_dict[mol_chain[0]]
        print(pdb_assembly_dict)
        data = {}
        data['pdb_id'] = pdb_id
        # ions are probably not needed here
        data['atom_pos'] = []
        data['residue_pos'] = []
        data['residue_type'] = []
        data['pro_opers'] = []
        data['chain_id'] = []
        data['chain'] = []
        data['atom_masks'] = []

        data['torsion_angles_sin_cos'] = []
        data['torsion_angles_mask'] = []
        mol_list = []
        mol_idx_list = []
        mol_name_list = []
        mol_idx = 0
        for mol_file in mol_files:
            mol_name = mol_file.split('/')[-1].split('.')[0]
            if len(mol_opers[mol_name]) == 0:
                continue
            try:
                mol = AllChem.MolFromPDBFile(mol_file[:-5] + '.pdb')
                mol_pos = mol.GetConformer().GetPositions()
                mol_atom = np.array([atom.GetSymbol() for atom in mol.GetAtoms()])
                mol_atom_ = mol_atom != 'H'
                if len(mol_pos[mol_atom_]) < 4:
                    continue
            except:
                mol_pos = None
                pass
            if mol_pos is not None:
                for opers_id, opers in enumerate(mol_opers[mol_name]):
                    mol_idx += 1
                    if opers == 'I':
                        mol_list.append(mol_pos[mol_atom_])
                        mol_idx_list.append(np.ones(mol_list[-1].shape[0]) * mol_idx)
                        mol_name_list.append(mol_file[:-5] + '.pdb')
                        continue
                    rot, trans = opers
                    rot = np.array(rot).reshape(3, 3)
                    trans = np.array(trans).reshape(3)
                    mol_pos_t = mol_pos @ rot.T + trans
                    mol_list.append(mol_pos_t[mol_atom_])
                    mol_idx_list.append(np.ones(mol_list[-1].shape[0]) * mol_idx)
                    mol_name_list.append(mol_file[:-5] + '_' + str(opers_id + 1) + '.pdb')

        # flatten all ligand atoms to simplify later computation:

        if len(mol_list) > 0:
            mol_list = np.concatenate(mol_list)
            mol_idx_list = np.concatenate(mol_idx_list)
            assert mol_list.shape[0] == mol_idx_list.shape[0]
            data['mol_list'] = mol_list
            data['mol_idx_list'] = mol_idx_list


        date = mmcif_obj.header["release_date"]
        resolution = np.array([mmcif_obj.header["resolution"]])
        protein_len = 0
        chain_idx = 0
        chain_to_id = {}
        for chain_id in mmcif_obj.chain_to_seqres:
            if chain_id not in pdb_assembly_dict:
                continue
            label_name = f"{pdb_id}_{chain_id}"
            # try:
            all_atom_positions, all_atom_mask = get_atom_positions(
                mmcif_obj, chain_id, max_ca_ca_distance=float("inf")
            )
            sequence = mmcif_object_to_fasta(mmcif_obj, chain_id)
            aatype_idx = np.array(
                [
                    restype_order_with_x[rn]
                    if rn in restype_order_with_x
                    else restype_order_with_x["X"]
                    for rn in sequence
                ]
            )
            seq_len = aatype_idx.shape[0]
            _, counts = np.unique(aatype_idx, return_counts=True)
            freqs = counts.astype(np.float32) / seq_len
            max_freq = np.max(freqs)
            if resolution > 9 or max_freq > 0.8:
                continue
            aatype_index = aatype_idx.astype(np.int8)
            origin_pos = all_atom_positions.astype(np.float32)
            all_atom_mask = all_atom_mask.astype(np.int8)
            torsion_angles_sin_cos, alt_torsion_angles_sin_cos, torsion_angles_mask = atom37_to_torsion_angles(
                aatype_index, origin_pos, all_atom_mask
            )

            residues = mmcif_obj.seqres_to_structure[chain_id]
            residue_names = [residues[t].name for t in range(len(residues))]

            count_ = 0
            opers_id = 0
            chain_to_id[chain_id] = chain_idx
            if chain_id in pdb_assembly_dict:
                chain_idx += 1
                count_t = 0
                opers_id += 1

            if not (len(all_atom_mask) > 0 and (all_atom_mask > 0).any()):
                continue
            pos = origin_pos
            data['atom_pos'].append(pos)
            data['residue_pos'].append(np.arange(0, pos.shape[0], 1))
            data['residue_type'].append(aatype_index)
            data['chain_id'].append(np.ones(pos.shape[0]) * chain_idx)
            data['atom_masks'].append(all_atom_mask)
            data['pro_opers'].append(pdb_assembly_dict[chain_id])
            data['torsion_angles_sin_cos'].append(torsion_angles_sin_cos.reshape(-1, 14))
            data['torsion_angles_mask'].append(torsion_angles_mask)
            data['chain'].append(chain_id)

            protein_len += len(pos)

        # ion_data = txn.get(pdb_id.encode())
        # if ion_data is not None:
        #     ion_data = pickle.loads(ion_data)
        #     data['ion_type'] = ion_data['ion_atom_type']
        #     data['ion_pos'] = ion_data['ion_pos']
        #     ion_chain_id = []
        #     for _ in ion_data['ion_chain_id']:
        #         if _ in chain_to_id:
        #             ion_chain_id.append(chain_to_id[_])
        #         else:
        #             chain_idx += 1
        #             chain_to_id[_] = chain_idx
        #             ion_chain_id.append(chain_idx)
        #     data['ion_chain_id'] = ion_data['ion_chain_id']
        #     data['ion_opers'] = [pdb_assembly_dict[_] for _ in ion_data['ion_chain_id']]
        if len(data['residue_type']) == 0:
            assert 1 == 0, (pdb_id, task_id)
            return None, 'no protein'

    else:
        return None, "Parse mmcif error"
    split_date = datetime.datetime.strptime("2022-05-15", "%Y-%m-%d")
    date = datetime.datetime.strptime(date, "%Y-%m-%d")
    if date < split_date:
        return pickle.dumps(data, protocol=-1), 'train'
    else:
        return pickle.dumps(data, protocol=-1), 'valid'
    # return None, None
[21]
def main():
    path = r"./mmcif_file/6tan.cif.gz"
    data, info = get_label((path, "ah"))
    print(pickle.loads(data))
    # exit()
    # parser = argparse.ArgumentParser()
    # parser.add_argument("--mmcif-dir", type=str, default="/your path")
    # parser.add_argument("--label-dir", type=str, default="")
    # parser.add_argument("--output-fn", type=str, default="")
    # parser.add_argument("--debug", action="store_true", default=False)
    # args = parser.parse_args()
    # print(args)

    # os.makedirs(os.path.dirname(args.output_fn), exist_ok=True)
    # print("start")

    # tasks = ['a' + chr(y) for y in range(97, 123)]
    # pro = "5zz8"
    # # task_list = ['af']
    # # task_list = ['a'+_ for _ in 'abcdefghijklmnopqrstuvwxyz']
    # task_list = [task for task in tasks if os.path.isdir('/your path/'+task+'/'+pro)]
    # mmcif_files = []
    # for item in task_list:
    #     # pdb_list = [_ for _ in os.listdir('/your path/'+item+'/') if os.path.isdir('/your path/'+item+'/'+_)]
    #     pdb_list = [pro]
    #     mmcif_files.extend([(os.path.join(args.mmcif_dir, _+'.cif.gz'), item) for _ in pdb_list])
    # print(mmcif_files)
    # file_cnt = len(mmcif_files)
    # print(f"len(mmcif_files): {len(mmcif_files)}")
    # get_label(mmcif_files[0])
    # mmcif_files = mmcif_files[9879:]

    # outputfilename = os.path.join('/your path', 'train_debug.lmdb')
    # try:
    #     os.remove(outputfilename)
    # except:
    #     pass
    # env_new_train = lmdb.open(
    #     outputfilename,
    #     subdir=False,
    #     readonly=False,
    #     lock=False,
    #     readahead=False,
    #     meminit=False,
    #     max_readers=1,
    #     map_size=int(100e9),
    # )
    # txn_write_train = env_new_train.begin(write=True)


    # validfilename = os.path.join('/your path', 'valid_debug.lmdb')
    # try:
    #     os.remove(validfilename)
    # except:
    #     pass
    # env_new_valid = lmdb.open(
    #     validfilename,
    #     subdir=False,
    #     readonly=False,
    #     lock=False,
    #     readahead=False,
    #     meminit=False,
    #     max_readers=1,
    #     map_size=int(100e9),
    # )
    # txn_write_valid = env_new_valid.begin(write=True)

    # len_dict = {}
    # with Pool(116) as pool:
    #     i = 0
    #     j = 0
    #     for ret in tqdm(
    #         pool.imap(get_label, mmcif_files), total=file_cnt
    #     ):
    #         ret, info = ret
    #         # assert info is None, (ret, info)
    #         if ret is not None:
    #             if info == 'train':
    #                 txn_write_train.put(f'{i}'.encode("ascii"), ret)
    #                 i += 1
    #                 if i % 100 == 0:
    #                     txn_write_train.commit()
    #                     txn_write_train = env_new_train.begin(write=True)
    #             elif info == 'valid':
    #                 txn_write_valid.put(f'{j}'.encode("ascii"), ret)
    #                 j += 1
    #                 if j % 100 == 0:
    #                     txn_write_valid.commit()
    #                     txn_write_valid = env_new_valid.begin(write=True)

    # print('{} process {} lines'.format(outputfilename, i))
    # txn_write_train.commit()
    # env_new_train.close()
    # print('{} process {} lines'.format(validfilename, j))
    # txn_write_valid.commit()
    # env_new_valid.close()
[22]
main()
{'A': ['I']}
{'pdb_id': '6tan', 'atom_pos': [array([[[ 2.6288e+01,  1.8113e+01,  6.4640e+00],
        [ 2.5922e+01,  1.7360e+01,  5.2600e+00],
        [ 2.4557e+01,  1.6742e+01,  5.4120e+00],
        ...,
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00]],

       [[ 2.4279e+01,  1.5796e+01,  4.5530e+00],
        [ 2.2969e+01,  1.5124e+01,  4.5300e+00],
        [ 2.3180e+01,  1.3653e+01,  4.1580e+00],
        ...,
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00]],

       [[ 2.2241e+01,  1.2809e+01,  4.5430e+00],
        [ 2.2230e+01,  1.1404e+01,  4.1000e+00],
        [ 2.0867e+01,  1.1149e+01,  3.4740e+00],
        ...,
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00]],

       ...,

       [[ 2.4879e+01,  2.3000e-02,  8.1450e+00],
        [ 2.5918e+01,  6.1500e-01,  7.2760e+00],
        [ 2.7272e+01,  6.3500e-01,  7.9990e+00],
        ...,
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 2.3937e+01,  5.6870e+00,  4.8650e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00]],

       [[ 2.7267e+01,  5.8700e-01,  9.3210e+00],
        [ 2.8514e+01,  6.7100e-01,  1.0112e+01],
        [ 2.9242e+01, -6.6800e-01,  1.0030e+01],
        ...,
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00]],

       [[ 2.8558e+01, -1.7560e+00,  9.7070e+00],
        [ 2.9186e+01, -3.0940e+00,  9.7320e+00],
        [ 3.0312e+01, -3.1960e+00,  8.7040e+00],
        ...,
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 2.4561e+01, -6.7460e+00,  1.1791e+01],
        [ 3.0233e+01, -2.5460e+00,  7.6300e+00]]], dtype=float32)], 'residue_pos': [array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169])], 'residue_type': [array([ 7, 12, 16,  6, 18, 11, 10, 19, 19, 19,  7,  0,  4,  7, 19,  7, 11,
       15,  0, 10, 16,  9,  5, 10,  9,  5,  2,  8, 13, 19,  3,  6, 18,  3,
       14, 16,  9,  6,  3, 15, 18,  1, 11,  5, 19, 19,  9,  3,  7,  6, 16,
       15, 10, 10,  3,  9, 10,  3, 16,  0,  7,  5,  6,  6, 18, 15,  0, 12,
        1,  3,  5, 18, 12,  1, 16,  7,  6,  7, 13, 10, 10, 19, 13,  0,  9,
        2,  2, 16, 11, 15, 13,  6,  3,  9,  8,  8, 18,  1,  6,  5,  9, 11,
        1, 19, 11,  3, 15,  6,  3, 19, 14, 12, 19, 10, 19,  7,  2, 11, 15,
        3, 10, 14, 15,  1, 16, 19,  3, 16, 11,  5,  0,  5,  3, 10,  0,  1,
       15, 18,  7,  9, 14, 13,  9,  6, 16, 15,  0, 11, 16,  1,  5,  7, 19,
        3,  3,  0, 13, 18, 16, 10, 19,  1,  6,  9,  1, 11,  8, 11,  6, 11],
      dtype=int8)], 'pro_opers': [['I']], 'chain_id': [array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])], 'chain': ['A'], 'atom_masks': [array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 1, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 1, 1]], dtype=int8)], 'torsion_angles_sin_cos': [array([[ 1.        ,  0.        ,  1.        , ...,  0.        ,
         1.        ,  0.        ],
       [-0.04181683, -0.9991253 , -0.5784515 , ...,  0.6210887 ,
         1.        ,  0.        ],
       [ 0.14288986, -0.98973864, -0.814764  , ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.09878621, -0.9951087 , -0.93555635, ...,  0.45058557,
        -0.15159912, -0.98844206],
       [-0.05579257, -0.9984424 , -0.9700919 , ..., -0.9168765 ,
         1.        ,  0.        ],
       [-0.09718688, -0.9952661 , -0.8983881 , ..., -0.90295136,
         0.03551648, -0.99936914]], dtype=float32)], 'torsion_angles_mask': [array([[0, 0, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 0],
       [1, 1, 1, ..., 1, 1, 1]])], 'mol_list': array([[-4.726,  3.431,  3.629],
       [-3.11 ,  1.254,  2.538],
       [-6.793,  2.313,  4.811],
       [-5.757,  3.316,  4.857],
       [-7.954,  2.747,  5.684],
       [-7.563,  2.845,  7.064],
       [-8.498,  4.13 ,  5.318],
       [-9.916,  4.16 ,  5.547],
       [-7.782,  5.05 ,  6.282],
       [-8.569,  6.169,  6.556],
       [-7.697,  4.188,  7.518],
       [-6.224,  4.867, 12.362],
       [-5.48 ,  3.575,  2.338],
       [-3.997,  0.395,  1.659],
       [-7.497,  4.801, 11.871],
       [-8.527,  4.875, 12.733],
       [-3.769,  4.486,  4.021],
       [-2.201,  0.393,  3.425],
       [-7.733,  4.637, 10.581],
       [-4.091,  1.957,  3.673],
       [-2.4  ,  2.39 ,  1.803],
       [-6.657,  4.563,  9.758],
       [-5.288,  4.661, 10.228],
       [-5.087,  4.804, 11.638],
       [-3.972,  4.831, 12.198],
       [-4.454,  4.514,  9.173],
       [-5.218,  4.37 ,  8.06 ],
       [-6.56 ,  4.407,  8.428]]), 'mol_idx_list': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])}
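
get_label returns a pickled record plus a 'train'/'valid' tag (split on the 2022-05-15 release date), so a single record can be stored the same way the commented-out batch pipeline in main() does. A minimal single-record sketch (the LMDB path here is illustrative):

serialized, split = get_label(("./mmcif_file/6tan.cif.gz", "ah"))
print(split, pickle.loads(serialized)['pdb_id'])

env = lmdb.open('./train_debug.lmdb', subdir=False, map_size=int(1e9))
with env.begin(write=True) as txn:
    txn.put(b'0', serialized)  # key '0', value = pickled data dict
env.close()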