探究
实验室
计算
公开
使用RFdiffusion进行蛋白质设计
python
protein design
pythonprotein design
dingshizhi
更新于 2025-04-14
推荐镜像 :protein-design-rfdiffusion:version_4
推荐机型 :c8_m32_1 * NVIDIA V100
简介
导入必要的包
单体设计
异源寡聚体设计
同源寡聚体设计
Binder设计
参考资料

本notebook提供了一个简单的流程,介绍如何使用 RFdiffusion 进行蛋白质设计

简介

RFdiffusion 是一种用于结构生成的方法,可以包含或不包含条件信息(如基序、目标等)。它能够执行各种蛋白质设计挑战,如 RFdiffusion 论文 中所述。

有关更详细的说明和示例,请参阅 RFdiffusion 文档

代码
文本

导入必要的包

代码
文本
[1]
import os
import sys
import torch
from omegaconf import OmegaConf
import hydra
from rfdiffusion.inference import utils as iu
from hydra.core.hydra_config import HydraConfig
from rfdiffusion.util import writepdb_multi, writepdb
/opt/mamba/envs/SE3nv/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
代码
文本
[18]


def _resolve_input_pdb(pdb, pdb_dir='./pdbs'):
    """Map *pdb* (a file path or a bare PDB ID) to an existing PDB file path.

    A value that is already an existing file is returned unchanged; otherwise
    ``<pdb_dir>/<pdb>.pdb`` is tried. Raises ``FileNotFoundError`` when neither
    exists.
    """
    pdb = str(pdb)
    by_id = os.path.join(pdb_dir, f"{pdb}.pdb")
    for candidate in (pdb, by_id):
        if os.path.isfile(candidate):
            return candidate
    raise FileNotFoundError(
        f"PDB input {pdb!r} not found (looked for {pdb!r} and {by_id!r})"
    )


def generate_protein(config_dict):
    """Run one RFdiffusion sampling pass and write the final backbone as a PDB.

    Args:
        config_dict (dict): design settings. The dict itself is not modified.
            Recognised keys:
            - contigs (required): contig string describing the chain layout.
            - symmetry: symmetry specifier; ``None`` or ``'none'`` (default)
              disables symmetry. The notebook passes values such as ``'c2'``.
            - order: symmetry order, forwarded only when symmetry is enabled.
            - output_dir: output directory (default ``'outputs'``).
            - num_designs: forwarded to the inference config (default 1).
              This function itself performs a single sampling pass per call.
            - deterministic: when true, seed the RNGs before sampling
              (default ``False``).
            - design_id: suffix of the output file name (default 0); also used
              as the seed when it is an ``int`` and ``deterministic`` is set.
            - pdb: optional input structure, given either as a file path or as
              a PDB ID that has been saved as ``./pdbs/<ID>.pdb``.

    Returns:
        str: the output prefix; the structure is written to ``<prefix>.pdb``.

    Raises:
        ValueError: if ``'contigs'`` is missing.
        FileNotFoundError: if ``'pdb'`` is given but no matching file exists.
    """
    # Validate before any expensive work (config load / model load).
    if 'contigs' not in config_dict:
        raise ValueError("配置中必须包含 'contigs' 参数")

    # Work on a copy so the defaults below never leak into the caller's dict.
    settings = dict(config_dict)
    settings.setdefault('output_dir', 'outputs')
    settings.setdefault('num_designs', 1)
    settings.setdefault('deterministic', False)
    settings.setdefault('design_id', 0)

    input_pdb = None
    if settings.get('pdb'):
        input_pdb = _resolve_input_pdb(settings['pdb'])

    base_conf = OmegaConf.load("/root/RFdiffusion/config/inference/base.yaml")

    inference_conf = {
        'output_prefix': os.path.join(settings['output_dir'], 'design'),
        'num_designs': settings['num_designs'],
        'deterministic': settings['deterministic'],
        'write_trajectory': True,
        'dump_pdb': True,
        'dump_pdb_path': '/dev/shm',
    }
    if input_pdb is not None:
        # Without this the 'pdb' entry of the config was silently ignored.
        inference_conf['input_pdb'] = input_pdb

    # A missing 'symmetry' key means "no symmetry"; the previous
    # `.get('symmetry') != 'none'` test raised KeyError in that case.
    symmetry = settings.get('symmetry')
    if symmetry not in (None, 'none'):
        inference_conf['symmetry'] = symmetry
        if settings.get('order'):
            inference_conf['order'] = settings['order']

    user_conf = {
        'inference': inference_conf,
        'contigmap': {'contigs': [settings['contigs']]},
    }
    conf = OmegaConf.merge(base_conf, OmegaConf.create(user_conf))

    if settings['deterministic']:
        # The flag alone only lands in the config; seed the RNGs explicitly so
        # repeated calls with the same design_id start from the same noise.
        # NOTE(review): full reproducibility on GPU may need more than seeding.
        import random

        import numpy as np

        design_id = settings['design_id']
        seed = design_id if isinstance(design_id, int) else 0
        torch.manual_seed(seed)
        np.random.seed(seed)
        random.seed(seed)

    sampler = iu.sampler_selector(conf)

    # Initial structure and sequence for the reverse diffusion.
    x_init, seq_init = sampler.sample_init()
    x_t = torch.clone(x_init)
    seq_t = torch.clone(seq_init)

    # Step from the input diffusion step down to final_step (inclusive).
    final_step = sampler.inf_conf.final_step
    for t in range(int(sampler.t_step_input), final_step - 1, -1):
        _px0, x_t, seq_t, _plddt = sampler.sample_step(
            t=t, x_t=x_t, seq_init=seq_t, final_step=final_step
        )

    out_prefix = f"{conf.inference.output_prefix}_{settings['design_id']}"
    os.makedirs(os.path.dirname(out_prefix), exist_ok=True)

    # Positions whose initial token is 21 are written as token 7 and get a
    # B-factor of 0; every other position keeps its input token and B-factor 1.
    # NOTE(review): presumably 21 = mask and 7 = glycine — confirm against the
    # RFdiffusion residue alphabet.
    init_tokens = torch.argmax(seq_init, dim=-1)
    designed = init_tokens == 21
    final_seq = torch.where(designed, 7, init_tokens)

    bfacts = torch.ones_like(final_seq.squeeze())
    bfacts[designed] = 0

    # The last x_t is the tensor the earlier stack/flip/[0] sequence selected;
    # only its first four atoms per residue are written.
    out = f"{out_prefix}.pdb"
    writepdb(
        out,
        x_t[:, :4],
        final_seq,
        sampler.binderlen,
        chain_idx=sampler.chain_idx,
        bfacts=bfacts,
    )

    return out_prefix


代码
文本

单体设计

生成一个长度为100的单体

代码
文本
[11]

# Example settings for a de novo monomer
config = dict(
    contigs='100',           # one chain of 100 residues
    symmetry='none',         # no symmetry constraint
    order=1,                 # no copies needed
    output_dir='outputs',    # optional; defaults to 'outputs'
    num_designs=1,           # optional; defaults to 1
    deterministic=False,     # optional; defaults to False. True is meant to give repeatable results
    design_id=0,             # optional; defaults to 0
)

# Run the design and report the returned output prefix
output_path = generate_protein(config)
print(f"Generated protein structure saved to: {output_path}")
Reading models from /root/RFdiffusion/rfdiffusion/inference/../../models
This is inf_conf.ckpt_path
/root/RFdiffusion/rfdiffusion/inference/../../models/Base_ckpt.pt
Assembling -model, -diffuser and -preprocess configs from checkpoint
USING MODEL CONFIG: self._conf[model][n_extra_block] = 4
USING MODEL CONFIG: self._conf[model][n_main_block] = 32
USING MODEL CONFIG: self._conf[model][n_ref_block] = 4
USING MODEL CONFIG: self._conf[model][d_msa] = 256
USING MODEL CONFIG: self._conf[model][d_msa_full] = 64
USING MODEL CONFIG: self._conf[model][d_pair] = 128
USING MODEL CONFIG: self._conf[model][d_templ] = 64
USING MODEL CONFIG: self._conf[model][n_head_msa] = 8
USING MODEL CONFIG: self._conf[model][n_head_pair] = 4
USING MODEL CONFIG: self._conf[model][n_head_templ] = 4
USING MODEL CONFIG: self._conf[model][d_hidden] = 32
USING MODEL CONFIG: self._conf[model][d_hidden_templ] = 32
USING MODEL CONFIG: self._conf[model][p_drop] = 0.15
USING MODEL CONFIG: self._conf[model][SE3_param_full] = {'num_layers': 1, 'num_channels': 32, 'num_degrees': 2, 'n_heads': 4, 'div': 4, 'l0_in_features': 8, 'l0_out_features': 8, 'l1_in_features': 3, 'l1_out_features': 2, 'num_edge_features': 32}
USING MODEL CONFIG: self._conf[model][SE3_param_topk] = {'num_layers': 1, 'num_channels': 32, 'num_degrees': 2, 'n_heads': 4, 'div': 4, 'l0_in_features': 64, 'l0_out_features': 64, 'l1_in_features': 3, 'l1_out_features': 2, 'num_edge_features': 64}
USING MODEL CONFIG: self._conf[model][freeze_track_motif] = False
USING MODEL CONFIG: self._conf[model][use_motif_timestep] = True
USING MODEL CONFIG: self._conf[diffuser][T] = 50
USING MODEL CONFIG: self._conf[diffuser][b_0] = 0.01
USING MODEL CONFIG: self._conf[diffuser][b_T] = 0.07
USING MODEL CONFIG: self._conf[diffuser][schedule_type] = linear
USING MODEL CONFIG: self._conf[diffuser][so3_type] = igso3
USING MODEL CONFIG: self._conf[diffuser][crd_scale] = 0.25
USING MODEL CONFIG: self._conf[diffuser][so3_schedule_type] = linear
USING MODEL CONFIG: self._conf[diffuser][min_b] = 1.5
USING MODEL CONFIG: self._conf[diffuser][max_b] = 2.5
USING MODEL CONFIG: self._conf[diffuser][min_sigma] = 0.02
USING MODEL CONFIG: self._conf[diffuser][max_sigma] = 1.5
USING MODEL CONFIG: self._conf[preprocess][sidechain_input] = False
USING MODEL CONFIG: self._conf[preprocess][motif_sidechain_input] = True
USING MODEL CONFIG: self._conf[preprocess][d_t1d] = 22
USING MODEL CONFIG: self._conf[preprocess][d_t2d] = 44
USING MODEL CONFIG: self._conf[preprocess][prob_self_cond] = 0.5
USING MODEL CONFIG: self._conf[preprocess][str_self_cond] = True
USING MODEL CONFIG: self._conf[preprocess][predict_previous] = False
Successful diffuser __init__
With this beta schedule (linear schedule, beta_0 = 0.04, beta_T = 0.28), alpha_bar_T = 0.00013696048699785024
Generated protein structure saved to: outputs/design_0
代码
文本

异源寡聚体设计

生成一个由两条链组成的异源寡聚体,两条链的长度分别为 50 和 100

代码
文本
[19]
config_hetero = {
    # Two chains of fixed length 50 and 100. '/0 ' marks a chain break in the
    # RFdiffusion contig syntax; the previous value '50-100' requested a single
    # chain whose length is sampled between 50 and 100.
    'contigs': '50-50/0 100-100',
    'symmetry': 'none',
    'order': 1,
    'output_dir': 'outputs',
    'num_designs': 1,
    'deterministic': False,
    'design_id': 1,
}
generate_protein(config_hetero)
Reading models from /root/RFdiffusion/rfdiffusion/inference/../../models
This is inf_conf.ckpt_path
/root/RFdiffusion/rfdiffusion/inference/../../models/Base_ckpt.pt
Assembling -model, -diffuser and -preprocess configs from checkpoint
USING MODEL CONFIG: self._conf[model][n_extra_block] = 4
USING MODEL CONFIG: self._conf[model][n_main_block] = 32
USING MODEL CONFIG: self._conf[model][n_ref_block] = 4
USING MODEL CONFIG: self._conf[model][d_msa] = 256
USING MODEL CONFIG: self._conf[model][d_msa_full] = 64
USING MODEL CONFIG: self._conf[model][d_pair] = 128
USING MODEL CONFIG: self._conf[model][d_templ] = 64
USING MODEL CONFIG: self._conf[model][n_head_msa] = 8
USING MODEL CONFIG: self._conf[model][n_head_pair] = 4
USING MODEL CONFIG: self._conf[model][n_head_templ] = 4
USING MODEL CONFIG: self._conf[model][d_hidden] = 32
USING MODEL CONFIG: self._conf[model][d_hidden_templ] = 32
USING MODEL CONFIG: self._conf[model][p_drop] = 0.15
USING MODEL CONFIG: self._conf[model][SE3_param_full] = {'num_layers': 1, 'num_channels': 32, 'num_degrees': 2, 'n_heads': 4, 'div': 4, 'l0_in_features': 8, 'l0_out_features': 8, 'l1_in_features': 3, 'l1_out_features': 2, 'num_edge_features': 32}
USING MODEL CONFIG: self._conf[model][SE3_param_topk] = {'num_layers': 1, 'num_channels': 32, 'num_degrees': 2, 'n_heads': 4, 'div': 4, 'l0_in_features': 64, 'l0_out_features': 64, 'l1_in_features': 3, 'l1_out_features': 2, 'num_edge_features': 64}
USING MODEL CONFIG: self._conf[model][freeze_track_motif] = False
USING MODEL CONFIG: self._conf[model][use_motif_timestep] = True
USING MODEL CONFIG: self._conf[diffuser][T] = 50
USING MODEL CONFIG: self._conf[diffuser][b_0] = 0.01
USING MODEL CONFIG: self._conf[diffuser][b_T] = 0.07
USING MODEL CONFIG: self._conf[diffuser][schedule_type] = linear
USING MODEL CONFIG: self._conf[diffuser][so3_type] = igso3
USING MODEL CONFIG: self._conf[diffuser][crd_scale] = 0.25
USING MODEL CONFIG: self._conf[diffuser][so3_schedule_type] = linear
USING MODEL CONFIG: self._conf[diffuser][min_b] = 1.5
USING MODEL CONFIG: self._conf[diffuser][max_b] = 2.5
USING MODEL CONFIG: self._conf[diffuser][min_sigma] = 0.02
USING MODEL CONFIG: self._conf[diffuser][max_sigma] = 1.5
USING MODEL CONFIG: self._conf[preprocess][sidechain_input] = False
USING MODEL CONFIG: self._conf[preprocess][motif_sidechain_input] = True
USING MODEL CONFIG: self._conf[preprocess][d_t1d] = 22
USING MODEL CONFIG: self._conf[preprocess][d_t2d] = 44
USING MODEL CONFIG: self._conf[preprocess][prob_self_cond] = 0.5
USING MODEL CONFIG: self._conf[preprocess][str_self_cond] = True
USING MODEL CONFIG: self._conf[preprocess][predict_previous] = False
Successful diffuser __init__
With this beta schedule (linear schedule, beta_0 = 0.04, beta_T = 0.28), alpha_bar_T = 0.00013696048699785024
'outputs/design_1'
代码
文本

同源寡聚体设计

生成一个长度为 50、具有 C2(二重旋转)对称性的同源寡聚体

代码
文本
[ ]
# Settings for a C2-symmetric homo-oligomer.
# NOTE(review): with symmetry enabled the contig length may be the total over
# all subunits rather than the length of one subunit — confirm against the
# RFdiffusion documentation.
config_homo = dict(
    contigs='50',
    symmetry='c2',
    order=2,
    output_dir='outputs',
    num_designs=1,
    deterministic=False,
    design_id=2,
)

generate_protein(config_homo)
Reading models from /root/RFdiffusion/rfdiffusion/inference/../../models
This is inf_conf.ckpt_path
/root/RFdiffusion/rfdiffusion/inference/../../models/Base_ckpt.pt
Assembling -model, -diffuser and -preprocess configs from checkpoint
USING MODEL CONFIG: self._conf[model][n_extra_block] = 4
USING MODEL CONFIG: self._conf[model][n_main_block] = 32
USING MODEL CONFIG: self._conf[model][n_ref_block] = 4
USING MODEL CONFIG: self._conf[model][d_msa] = 256
USING MODEL CONFIG: self._conf[model][d_msa_full] = 64
USING MODEL CONFIG: self._conf[model][d_pair] = 128
USING MODEL CONFIG: self._conf[model][d_templ] = 64
USING MODEL CONFIG: self._conf[model][n_head_msa] = 8
USING MODEL CONFIG: self._conf[model][n_head_pair] = 4
USING MODEL CONFIG: self._conf[model][n_head_templ] = 4
USING MODEL CONFIG: self._conf[model][d_hidden] = 32
USING MODEL CONFIG: self._conf[model][d_hidden_templ] = 32
USING MODEL CONFIG: self._conf[model][p_drop] = 0.15
USING MODEL CONFIG: self._conf[model][SE3_param_full] = {'num_layers': 1, 'num_channels': 32, 'num_degrees': 2, 'n_heads': 4, 'div': 4, 'l0_in_features': 8, 'l0_out_features': 8, 'l1_in_features': 3, 'l1_out_features': 2, 'num_edge_features': 32}
USING MODEL CONFIG: self._conf[model][SE3_param_topk] = {'num_layers': 1, 'num_channels': 32, 'num_degrees': 2, 'n_heads': 4, 'div': 4, 'l0_in_features': 64, 'l0_out_features': 64, 'l1_in_features': 3, 'l1_out_features': 2, 'num_edge_features': 64}
USING MODEL CONFIG: self._conf[model][freeze_track_motif] = False
USING MODEL CONFIG: self._conf[model][use_motif_timestep] = True
USING MODEL CONFIG: self._conf[diffuser][T] = 50
USING MODEL CONFIG: self._conf[diffuser][b_0] = 0.01
USING MODEL CONFIG: self._conf[diffuser][b_T] = 0.07
USING MODEL CONFIG: self._conf[diffuser][schedule_type] = linear
USING MODEL CONFIG: self._conf[diffuser][so3_type] = igso3
USING MODEL CONFIG: self._conf[diffuser][crd_scale] = 0.25
USING MODEL CONFIG: self._conf[diffuser][so3_schedule_type] = linear
USING MODEL CONFIG: self._conf[diffuser][min_b] = 1.5
USING MODEL CONFIG: self._conf[diffuser][max_b] = 2.5
USING MODEL CONFIG: self._conf[diffuser][min_sigma] = 0.02
USING MODEL CONFIG: self._conf[diffuser][max_sigma] = 1.5
USING MODEL CONFIG: self._conf[preprocess][sidechain_input] = False
USING MODEL CONFIG: self._conf[preprocess][motif_sidechain_input] = True
USING MODEL CONFIG: self._conf[preprocess][d_t1d] = 22
USING MODEL CONFIG: self._conf[preprocess][d_t2d] = 44
USING MODEL CONFIG: self._conf[preprocess][prob_self_cond] = 0.5
USING MODEL CONFIG: self._conf[preprocess][str_self_cond] = True
USING MODEL CONFIG: self._conf[preprocess][predict_previous] = False
Successful diffuser __init__
With this beta schedule (linear schedule, beta_0 = 0.04, beta_T = 0.28), alpha_bar_T = 0.00013696048699785024
'outputs/design_2'
代码
文本

Binder设计

生成一个长度为 50 的 binder,使其结合 PDB 4N5T 链 A 的第 17–36 位残基区段

代码
文本
[32]
# 首先下载PDB文件
import os
from Bio import PDB

def download_pdb(pdb_id, pdir='./pdbs'):
    """Download a PDB entry and return the path of the local ``<pdb_id>.pdb``.

    Args:
        pdb_id: PDB identifier, e.g. ``'4N5T'``.
        pdir: directory that receives the file (default ``'./pdbs'``).

    Returns:
        str: path ``<pdir>/<pdb_id>.pdb``.

    Raises:
        FileNotFoundError: if the download produced no file.
    """
    new_name = os.path.join(pdir, f"{pdb_id}.pdb")
    # Reuse an earlier download instead of hitting the network again.
    if os.path.exists(new_name):
        return new_name

    os.makedirs(pdir, exist_ok=True)
    pdbl = PDB.PDBList()
    pdbl.retrieve_pdb_file(pdb_id, pdir=pdir, file_format='pdb')

    # The file is saved as pdb<id>.ent; rename it to <ID>.pdb.
    old_name = os.path.join(pdir, f"pdb{pdb_id.lower()}.ent")
    if not os.path.exists(old_name):
        # Fail loudly instead of returning a path that does not exist.
        raise FileNotFoundError(
            f"Download of PDB entry {pdb_id!r} failed: {old_name!r} was not created"
        )
    os.replace(old_name, new_name)
    return new_name

def check_pdb_chains(pdb_file):
    """Print the residue count and first/last residue number of every chain.

    The file is parsed with ``PDB.PDBParser`` and each chain of each model is
    reported in turn; nothing is returned.
    """
    structure = PDB.PDBParser().get_structure('temp', pdb_file)
    print(f"PDB文件 {pdb_file} 的链和残基信息:")
    every_chain = (chain for model in structure for chain in model)
    for chain in every_chain:
        members = list(chain)
        first, last = members[0], members[-1]
        # id[1] is the second field of the residue id tuple.
        print(f"链 {chain.id}: {len(members)} 残基")
        print(f"第一个残基: {first.id[1]}")
        print(f"最后一个残基: {last.id[1]}")
        print("---")

# Create the download directory
os.makedirs('./pdbs', exist_ok=True)

# Download the structure, then print a per-chain summary of it
pdb_file = download_pdb('4N5T')
check_pdb_chains(pdb_file)
Downloading PDB structure '4n5t'...
PDB文件 ./pdbs/4N5T.pdb 的链和残基信息:
链 A: 191 残基
第一个残基: 17
最后一个残基: 301
---
链 B: 39 残基
第一个残基: 16
最后一个残基: 124
---
/opt/mamba/envs/SE3nv/lib/python3.9/site-packages/Bio/PDB/StructureBuilder.py:100: PDBConstructionWarning: WARNING: Chain A is discontinuous at line 1234.
  warnings.warn(
/opt/mamba/envs/SE3nv/lib/python3.9/site-packages/Bio/PDB/StructureBuilder.py:100: PDBConstructionWarning: WARNING: Chain B is discontinuous at line 1335.
  warnings.warn(
代码
文本
[34]
# Settings for a 50-residue binder against residues 17-36 of chain A.
config_binder = dict(
    contigs='A17-36/0 50-50',  # format: [target chain/residues]/0 [binder length]
    pdb='4N5T',                # PDB ID of the target structure
    symmetry='none',
    order=1,
    output_dir='outputs',
    num_designs=1,
    deterministic=False,
    design_id=3,
)

generate_protein(config_binder)
Reading models from /root/RFdiffusion/rfdiffusion/inference/../../models
This is inf_conf.ckpt_path
/root/RFdiffusion/rfdiffusion/inference/../../models/Base_ckpt.pt
Assembling -model, -diffuser and -preprocess configs from checkpoint
USING MODEL CONFIG: self._conf[model][n_extra_block] = 4
USING MODEL CONFIG: self._conf[model][n_main_block] = 32
USING MODEL CONFIG: self._conf[model][n_ref_block] = 4
USING MODEL CONFIG: self._conf[model][d_msa] = 256
USING MODEL CONFIG: self._conf[model][d_msa_full] = 64
USING MODEL CONFIG: self._conf[model][d_pair] = 128
USING MODEL CONFIG: self._conf[model][d_templ] = 64
USING MODEL CONFIG: self._conf[model][n_head_msa] = 8
USING MODEL CONFIG: self._conf[model][n_head_pair] = 4
USING MODEL CONFIG: self._conf[model][n_head_templ] = 4
USING MODEL CONFIG: self._conf[model][d_hidden] = 32
USING MODEL CONFIG: self._conf[model][d_hidden_templ] = 32
USING MODEL CONFIG: self._conf[model][p_drop] = 0.15
USING MODEL CONFIG: self._conf[model][SE3_param_full] = {'num_layers': 1, 'num_channels': 32, 'num_degrees': 2, 'n_heads': 4, 'div': 4, 'l0_in_features': 8, 'l0_out_features': 8, 'l1_in_features': 3, 'l1_out_features': 2, 'num_edge_features': 32}
USING MODEL CONFIG: self._conf[model][SE3_param_topk] = {'num_layers': 1, 'num_channels': 32, 'num_degrees': 2, 'n_heads': 4, 'div': 4, 'l0_in_features': 64, 'l0_out_features': 64, 'l1_in_features': 3, 'l1_out_features': 2, 'num_edge_features': 64}
USING MODEL CONFIG: self._conf[model][freeze_track_motif] = False
USING MODEL CONFIG: self._conf[model][use_motif_timestep] = True
USING MODEL CONFIG: self._conf[diffuser][T] = 50
USING MODEL CONFIG: self._conf[diffuser][b_0] = 0.01
USING MODEL CONFIG: self._conf[diffuser][b_T] = 0.07
USING MODEL CONFIG: self._conf[diffuser][schedule_type] = linear
USING MODEL CONFIG: self._conf[diffuser][so3_type] = igso3
USING MODEL CONFIG: self._conf[diffuser][crd_scale] = 0.25
USING MODEL CONFIG: self._conf[diffuser][so3_schedule_type] = linear
USING MODEL CONFIG: self._conf[diffuser][min_b] = 1.5
USING MODEL CONFIG: self._conf[diffuser][max_b] = 2.5
USING MODEL CONFIG: self._conf[diffuser][min_sigma] = 0.02
USING MODEL CONFIG: self._conf[diffuser][max_sigma] = 1.5
USING MODEL CONFIG: self._conf[preprocess][sidechain_input] = False
USING MODEL CONFIG: self._conf[preprocess][motif_sidechain_input] = True
USING MODEL CONFIG: self._conf[preprocess][d_t1d] = 22
USING MODEL CONFIG: self._conf[preprocess][d_t2d] = 44
USING MODEL CONFIG: self._conf[preprocess][prob_self_cond] = 0.5
USING MODEL CONFIG: self._conf[preprocess][str_self_cond] = True
USING MODEL CONFIG: self._conf[preprocess][predict_previous] = False
Successful diffuser __init__
With this beta schedule (linear schedule, beta_0 = 0.04, beta_T = 0.28), alpha_bar_T = 0.00013696048699785024
'outputs/design_3'
代码
文本

参考资料

  1. Watson, Joseph L., et al. "Broadly applicable and accurate protein design by integrating structure prediction networks and diffusion generative models." BioRxiv (2022): 2022-12.
  2. 项目GitHub仓库: https://github.com/RosettaCommons/RFdiffusion
代码
文本
双击即可修改
代码
文本
python
protein design
pythonprotein design
点个赞吧