新建
高斯输出文件解析: NMR数据的提取与处理

CLiu

推荐镜像 :Basic Image:ubuntu22.04-py3.10-irkernel-r4.4.1
推荐机型 :c2_m4_cpu
赞
数据集
nmr(v1)
NMR数据的提取与处理
代码
文本
[2]
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Parse the GIAO section of a Gaussian log and collect every atom's
# isotropic shielding constant.
def extract_shielding_constants(log_content):
    """Return a list of {'Atom Index', 'Element', 'Isotropic'} records.

    log_content is an iterable of raw log lines (e.g. file.readlines()).
    """
    records = []
    in_section = False
    for raw_line in log_content:
        if "SCF GIAO Magnetic shielding tensor" in raw_line:
            # NOTE(review): the flag is never reset -- assumes a single
            # tensor section per log file.
            in_section = True
            continue
        if not in_section:
            continue
        fields = raw_line.split()
        # Atom summary lines look like: "<index> <element> Isotropic = <value> ..."
        if len(fields) >= 5 and fields[2] == 'Isotropic':
            records.append({
                'Atom Index': int(fields[0]),
                'Element': fields[1],
                'Isotropic': float(fields[4]),
            })
    return records
# Convert shieldings to chemical shifts relative to the TMS reference.
def calculate_chemical_shifts(shielding_data, reference_shielding):
    """Attach 'Chemical Shift' (= sigma_ref - sigma) to each record, in place."""
    for record in shielding_data:
        record['Chemical Shift'] = reference_shielding - record['Isotropic']
    return shielding_data
# Normalized Lorentzian line shape centered at x0 with the given FWHM.
def lorentzian(x, x0, fwhm):
    """Evaluate (1/pi) * (fwhm/2) / ((x - x0)^2 + (fwhm/2)^2)."""
    half_width = fwhm / 2
    return (1 / np.pi) * half_width / ((x - x0) ** 2 + half_width ** 2)
# Plot the NMR spectrum with Lorentzian broadening applied to each shift.
def plot_nmr_spectrum(df, element, fwhm, points=1000):
    """Broaden df['Chemical Shift'] peaks with a Lorentzian and show the plot.

    df      -- DataFrame with a 'Chemical Shift' column (ppm)
    element -- label for the axis titles, e.g. 'C'
    fwhm    -- Lorentzian full width at half maximum (ppm)
    points  -- number of sampling points on the shift axis
    """
    # Pad the window beyond the extreme shifts: 1.5 ppm below, 10 ppm above.
    # (Fix: the old comment claimed 10 units on both sides.)
    min_shift = df['Chemical Shift'].min() - 1.5
    max_shift = df['Chemical Shift'].max() + 10
    x = np.linspace(min_shift, max_shift, points)
    spectrum = np.zeros_like(x)
    for shift in df['Chemical Shift']:
        spectrum += lorentzian(x, shift, fwhm)
    plt.plot(x, spectrum, label=f'{element}-NMR Spectrum')
    plt.xlabel('Chemical Shift (ppm)')
    plt.ylabel('Intensity')
    plt.title(f'{element}-NMR Spectrum with Lorentzian Broadening')
    plt.legend()  # fix: the label above was never shown without a legend call
    plt.gca().invert_xaxis()  # NMR convention: shift increases right-to-left
    plt.show()
# Read the Gaussian log file.
log_file_path = '/bohr/nmr-vl86/v1/133764.log'
with open(log_file_path, 'r') as file:
    log_content = file.readlines()

# Extract every atom's isotropic shielding constant.
all_shielding_data = extract_shielding_constants(log_content)

# Geometry optimized at B3LYP/6-31G(2df,p); NMR at mPW1PW91/6-311+G(2d,p).
# TMS reference shieldings: C = 186.9704 ppm, H = 31.7617 ppm.
reference_shielding = 186.9704

# Keep only the element of interest ('C' -> 13C NMR).
element = 'C'
carbon_shielding_data = [atom for atom in all_shielding_data if atom['Element'] == element]

# Convert shieldings to chemical shifts.
carbon_shielding_data = calculate_chemical_shifts(carbon_shielding_data, reference_shielding)

# DataFrame view for inspection.
df_carbon_shifts = pd.DataFrame(carbon_shielding_data)
print(df_carbon_shifts)

# Plot with a 0.2 ppm full width at half maximum.
fwhm = 0.2
plot_nmr_spectrum(df_carbon_shifts, element, fwhm)
Atom Index Element Isotropic Chemical Shift 0 1 C 126.3603 60.6101 1 3 C 104.4717 82.4987 2 4 C 122.9321 64.0383 3 6 C 54.8650 132.1054
代码
文本
[ ]
import os
import re
import pandas as pd
import numpy as np
from tqdm import tqdm
# Collect the isotropic shielding constant of every atom matching `element`.
def extract_shielding_constants(log_content, element):
    """Return the isotropic shieldings (floats) for `element` atoms only."""
    values = []
    in_tensor_section = False
    for raw in log_content:
        if "SCF GIAO Magnetic shielding tensor" in raw:
            in_tensor_section = True
            continue
        if not in_tensor_section:
            continue
        tokens = raw.split()
        # Summary lines: "<index> <element> Isotropic = <value> ..."
        if len(tokens) >= 5 and tokens[2] == 'Isotropic' and tokens[1] == element:
            values.append(float(tokens[4]))
    return values
# Chemical shift = reference (TMS) shielding minus the atom's shielding.
def calculate_chemical_shifts(isotropic_data, reference_shielding):
    """Return the chemical shift for each isotropic shielding value."""
    shifts = []
    for iso in isotropic_data:
        shifts.append(reference_shielding - iso)
    return shifts
# Recursively find every .log file under `directory`.
def get_log_files(directory):
    """Return full paths of all *.log files below `directory`."""
    matches = []
    for dirpath, _dirs, filenames in os.walk(directory):
        matches.extend(
            os.path.join(dirpath, name) for name in filenames if name.endswith('.log')
        )
    return matches
# Parse one .log file and return its chemical shifts for `element`.
def process_single_log_file(log_file, element, reference_shielding):
    """Read `log_file` and return the list of chemical shifts for `element`."""
    with open(log_file, 'r') as handle:
        lines = handle.readlines()
    isotropics = extract_shielding_constants(lines, element)
    return calculate_chemical_shifts(isotropics, reference_shielding)
# Scan every .log file and return the global chemical-shift range.
def get_min_max_shifts(directory, element, reference_shielding):
    """Return (min_shift, max_shift) over all logs; (None, None) if no shifts."""
    lo, hi = None, None
    for path in tqdm(get_log_files(directory), desc="Extracting min and max chemical shifts"):
        shifts = process_single_log_file(path, element, reference_shielding)
        if not shifts:
            continue
        local_lo, local_hi = min(shifts), max(shifts)
        lo = local_lo if lo is None else min(lo, local_lo)
        hi = local_hi if hi is None else max(hi, local_hi)
    return lo, hi
# Entry point: report the global 13C chemical-shift range of the dataset.
def main():
    """Scan all .log files and print the global 13C chemical-shift range."""
    directory_path = 'D:/Spec2Mol/NMR/nomad_raw_files'  # root of the .log files
    element = 'C'  # 'C' selects 13C NMR
    reference_shielding = 186.9704  # TMS reference shielding for C (ppm)
    # Fix: removed unused locals (output_file, points) -- this cell only
    # reports the range; nothing is written or interpolated here.
    min_shift, max_shift = get_min_max_shifts(directory_path, element, reference_shielding)
    print(f"全局最小化学位移: {min_shift} ppm, 全局最大化学位移: {max_shift} ppm")

# Run only when executed as a script.
if __name__ == '__main__':
    main()
代码
文本
[ ]
import os
import re
import pandas as pd
import numpy as np
from tqdm import tqdm
# Pull the isotropic shielding of each `element` atom out of a Gaussian log.
def extract_shielding_constants(log_content, element):
    """Return isotropic shieldings (floats) for atoms of `element`."""
    shieldings = []
    active = False
    for line in log_content:
        if "SCF GIAO Magnetic shielding tensor" in line:
            active = True
        elif active:
            parts = line.split()
            # "<index> <element> Isotropic = <value> ..." summary lines only.
            if len(parts) >= 5 and parts[2] == 'Isotropic' and parts[1] == element:
                shieldings.append(float(parts[4]))
    return shieldings
# Shift relative to the TMS reference: delta = sigma_ref - sigma.
def calculate_chemical_shifts(isotropic_data, reference_shielding):
    """Map each isotropic shielding to its chemical shift."""
    return [reference_shielding - sigma for sigma in isotropic_data]
# Walk `directory` and gather the paths of all .log files.
def get_log_files(directory):
    """Return full paths of every *.log file under `directory`."""
    return [
        os.path.join(root, name)
        for root, _, names in os.walk(directory)
        for name in names
        if name.endswith('.log')
    ]
# Normalized Lorentzian centered at x0 with full width at half maximum fwhm.
def lorentzian(x, x0, fwhm):
    """Evaluate the Lorentzian line shape at x (scalar or ndarray)."""
    hwhm = fwhm / 2
    return (1 / np.pi) * hwhm / ((x - x0) ** 2 + hwhm ** 2)
# Broaden each chemical shift with a Lorentzian and sample on the shared grid.
def apply_lorentzian_broadening(chemical_shifts, x_common, fwhm=0.5):
    """Return the summed Lorentzian intensity of all peaks sampled at x_common."""
    hwhm = fwhm / 2
    intensity = np.zeros_like(x_common)
    for peak in chemical_shifts:
        # Lorentzian line shape, inlined: (1/pi) * hwhm / ((x - x0)^2 + hwhm^2)
        intensity += (1 / np.pi) * hwhm / ((x_common - peak) ** 2 + hwhm ** 2)
    return intensity
# Broaden and align every .log spectrum, saving a CSV every `batch_size` files.
def process_all_log_files_in_batches(directory, element, reference_shielding, x_common, fwhm=0.5, batch_size=10000):
    """Process all logs under `directory`, writing aligned_C_NMR_batch_N.csv files.

    NOTE(review): this cell calls process_single_log_file, which is NOT
    defined in this cell -- it relies on an earlier notebook cell having
    defined it. Confirm the cells run in order.
    """
    log_files = get_log_files(directory)
    aligned_data = []  # broadened spectra accumulated for the current batch
    file_names = []    # matching .log base names for the current batch
    file_batch = 0     # batch counter used in output file names
    # Fix: removed the unused `batch_count` local.
    for i, log_file in enumerate(tqdm(log_files, desc="Aligning spectra")):
        chemical_shifts = process_single_log_file(log_file, element, reference_shielding)
        # Broaden this file's shifts and sample them on the common grid.
        broadened_spectrum = apply_lorentzian_broadening(chemical_shifts, x_common, fwhm)
        aligned_data.append(broadened_spectrum)
        file_names.append(os.path.basename(log_file))
        # Flush a full batch to disk and reset the buffers.
        if (i + 1) % batch_size == 0:
            file_batch += 1
            output_file = f'aligned_C_NMR_batch_{file_batch}.csv'
            save_aligned_data_to_csv(file_names, aligned_data, x_common, output_file)
            print(f"Batch {file_batch} saved with {len(file_names)} files.")
            aligned_data.clear()
            file_names.clear()
    # Save whatever is left after the last full batch.
    if aligned_data:
        file_batch += 1
        output_file = f'aligned_C_NMR_batch_{file_batch}.csv'
        save_aligned_data_to_csv(file_names, aligned_data, x_common, output_file)
        print(f"Final batch {file_batch} saved with {len(file_names)} files.")
# Persist one batch of aligned spectra to CSV.
def save_aligned_data_to_csv(file_names, aligned_data, x_common, output_file):
    """Write one row per log file: 'File Name' column, then one column per grid point."""
    table = pd.DataFrame(aligned_data, index=file_names, columns=x_common)
    table = table.reset_index().rename(columns={'index': 'File Name'})
    table.to_csv(output_file, index=False)
    print(f"Data successfully saved to {output_file}")
# Merge every batch CSV in the current directory into one file.
def merge_csv_files(output_file, batch_prefix='aligned_C_NMR_batch_'):
    """Concatenate all `batch_prefix*.csv` files (in batch order) into `output_file`.

    Fix: os.listdir returns files in arbitrary order, so the merged rows were
    nondeterministic; sort by the numeric batch suffix (lexical sort would
    put batch_10 before batch_2).
    """
    csv_files = [f for f in os.listdir('.') if f.startswith(batch_prefix) and f.endswith('.csv')]
    csv_files.sort(key=lambda name: int(name[len(batch_prefix):-4]))
    # Fix: pd.concat raises on an empty list; bail out cleanly instead.
    if not csv_files:
        print(f"No files matching {batch_prefix}*.csv found; nothing to merge.")
        return
    df_list = [pd.read_csv(f) for f in csv_files]
    final_df = pd.concat(df_list, ignore_index=True)
    final_df.to_csv(output_file, index=False)
    print(f"All batches merged into {output_file}")
# Entry point: broaden, align, batch-save, and merge all 13C spectra.
def main():
    """Run the full 13C NMR alignment pipeline over the dataset."""
    directory_path = 'D:/Spec2Mol/NMR/nomad_raw_files'  # root folder of Gaussian .log files
    element = 'C'  # 'C' selects 13C NMR
    # Geometry optimized at B3LYP/6-31G(2df,p); NMR at mPW1PW91/6-311+G(2d,p).
    # TMS reference shieldings: C = 186.9704 ppm, H = 31.7617 ppm.
    reference_shielding = 186.9704
    output_file = 'aligned_C_NMR_spectra.csv'  # final merged CSV
    fwhm = 0.5  # Lorentzian full width at half maximum (ppm)
    # Global QM9 13C range: -5.509 .. 285.594 ppm; sample -5.6..285.6 at 0.1 ppm.
    min_shift = -5.6
    max_shift = 285.6
    points = int((max_shift - min_shift) / 0.1) + 1
    x_common = np.linspace(min_shift, max_shift, points)
    # Fix: pass the fwhm variable instead of a duplicated 0.5 literal.
    process_all_log_files_in_batches(directory_path, element, reference_shielding,
                                     x_common, fwhm=fwhm, batch_size=10000)
    merge_csv_files(output_file)

# Run only when executed as a script.
if __name__ == '__main__':
    main()
代码
文本
[ ]
import os
import re
import pandas as pd
import numpy as np
from tqdm import tqdm
# Extract each `element` atom's isotropic shielding from a Gaussian log.
def extract_shielding_constants(log_content, element):
    """Return a list of isotropic shieldings for atoms of `element`."""
    result = []
    reading = False
    for text in log_content:
        if "SCF GIAO Magnetic shielding tensor" in text:
            reading = True
            continue
        if not reading:
            continue
        cols = text.split()
        # Match "<index> <element> Isotropic = <value> ..." rows only.
        if len(cols) >= 5 and cols[2] == 'Isotropic' and cols[1] == element:
            result.append(float(cols[4]))
    return result
# delta = sigma_ref - sigma for every shielding value.
def calculate_chemical_shifts(isotropic_data, reference_shielding):
    """Return the chemical shifts relative to the TMS reference shielding."""
    return [reference_shielding - value for value in isotropic_data]
# Recursively list every .log file beneath `directory`.
def get_log_files(directory):
    """Return all *.log paths found by walking `directory`."""
    collected = []
    for base, _subdirs, entries in os.walk(directory):
        for entry in entries:
            if not entry.endswith('.log'):
                continue
            collected.append(os.path.join(base, entry))
    return collected
# Read one .log file and compute its chemical shifts for `element`.
def process_single_log_file(log_file, element, reference_shielding):
    """Open `log_file`, extract `element` shieldings, and return their shifts."""
    with open(log_file, 'r') as fh:
        content = fh.readlines()
    return calculate_chemical_shifts(
        extract_shielding_constants(content, element), reference_shielding
    )
# Determine the global chemical-shift range across all .log files.
def get_min_max_shifts(directory, element, reference_shielding):
    """Return (min_shift, max_shift) across all logs; (None, None) if none found."""
    smallest = None
    largest = None
    for log_path in tqdm(get_log_files(directory), desc="Extracting min and max chemical shifts"):
        shifts = process_single_log_file(log_path, element, reference_shielding)
        if shifts:
            largest = max(shifts) if largest is None else max(largest, max(shifts))
            smallest = min(shifts) if smallest is None else min(smallest, min(shifts))
    return smallest, largest
# Entry point: report the global 1H chemical-shift range of the dataset.
def main():
    """Scan all .log files and print the global 1H chemical-shift range."""
    directory_path = 'D:/Spec2Mol/NMR/nomad_raw_files'  # root of the .log files
    element = 'H'  # 'H' selects 1H NMR
    reference_shielding = 31.7617  # TMS reference shielding for H (ppm)
    # Fix: removed unused locals (output_file, points) -- this cell only
    # reports the range; nothing is written or interpolated here.
    min_shift, max_shift = get_min_max_shifts(directory_path, element, reference_shielding)
    print(f"全局最小化学位移: {min_shift} ppm, 全局最大化学位移: {max_shift} ppm")

# Run only when executed as a script.
if __name__ == '__main__':
    main()
代码
文本
[ ]
import os
import re
import pandas as pd
import numpy as np
from tqdm import tqdm
# Gather the isotropic shielding values of `element` atoms from a Gaussian log.
def extract_shielding_constants(log_content, element):
    """Return the isotropic shieldings (floats) for `element` atoms."""
    isotropics = []
    in_giao = False
    for entry in log_content:
        if "SCF GIAO Magnetic shielding tensor" in entry:
            in_giao = True
        elif in_giao:
            pieces = entry.split()
            # Only "<index> <element> Isotropic = <value> ..." summary lines.
            if len(pieces) >= 5 and pieces[2] == 'Isotropic' and pieces[1] == element:
                isotropics.append(float(pieces[4]))
    return isotropics
# Compute delta = sigma_ref - sigma for each shielding.
def calculate_chemical_shifts(isotropic_data, reference_shielding):
    """Convert isotropic shieldings to chemical shifts against the TMS reference."""
    out = []
    for sigma in isotropic_data:
        out.append(reference_shielding - sigma)
    return out
# Find every .log file below `directory`, recursively.
def get_log_files(directory):
    """Return the full path of each *.log file under `directory`."""
    paths = []
    for top, _, names in os.walk(directory):
        paths += [os.path.join(top, n) for n in names if n.endswith('.log')]
    return paths
# Lorentzian line shape normalized to unit area.
def lorentzian(x, x0, fwhm):
    """Evaluate the Lorentzian centered at x0 with the given FWHM."""
    gamma = fwhm / 2  # half width at half maximum
    return gamma / (np.pi * ((x - x0) ** 2 + gamma ** 2))
# Sum a Lorentzian per peak, sampled on the shared grid x_common.
def apply_lorentzian_broadening(chemical_shifts, x_common, fwhm=0.5):
    """Return the total broadened intensity sampled at x_common."""
    gamma = fwhm / 2
    total = np.zeros_like(x_common)
    for center in chemical_shifts:
        # Lorentzian line shape, inlined: (1/pi) * gamma / ((x - x0)^2 + gamma^2)
        total += (1 / np.pi) * gamma / ((x_common - center) ** 2 + gamma ** 2)
    return total
# Broaden and align every .log spectrum, saving a CSV every `batch_size` files.
def process_all_log_files_in_batches(directory, element, reference_shielding, x_common, fwhm=0.5, batch_size=10000):
    """Process all logs under `directory`, writing aligned_H_NMR_batch_N.csv files.

    NOTE(review): this cell calls process_single_log_file, which is NOT
    defined in this cell -- it relies on an earlier notebook cell having
    defined it. Confirm the cells run in order.
    """
    log_files = get_log_files(directory)
    aligned_data = []  # broadened spectra accumulated for the current batch
    file_names = []    # matching .log base names for the current batch
    file_batch = 0     # batch counter used in output file names
    # Fix: removed the unused `batch_count` local.
    for i, log_file in enumerate(tqdm(log_files, desc="Aligning spectra")):
        chemical_shifts = process_single_log_file(log_file, element, reference_shielding)
        # Broaden this file's shifts and sample them on the common grid.
        broadened_spectrum = apply_lorentzian_broadening(chemical_shifts, x_common, fwhm)
        aligned_data.append(broadened_spectrum)
        file_names.append(os.path.basename(log_file))
        # Flush a full batch to disk and reset the buffers.
        if (i + 1) % batch_size == 0:
            file_batch += 1
            output_file = f'aligned_H_NMR_batch_{file_batch}.csv'
            save_aligned_data_to_csv(file_names, aligned_data, x_common, output_file)
            print(f"Batch {file_batch} saved with {len(file_names)} files.")
            aligned_data.clear()
            file_names.clear()
    # Save whatever is left after the last full batch.
    if aligned_data:
        file_batch += 1
        output_file = f'aligned_H_NMR_batch_{file_batch}.csv'
        save_aligned_data_to_csv(file_names, aligned_data, x_common, output_file)
        print(f"Final batch {file_batch} saved with {len(file_names)} files.")
# Write one batch of aligned spectra to a CSV file.
def save_aligned_data_to_csv(file_names, aligned_data, x_common, output_file):
    """Save rows of (file name, intensities at each x_common point) as CSV."""
    frame = pd.DataFrame(aligned_data, index=file_names, columns=x_common)
    frame = frame.reset_index().rename(columns={'index': 'File Name'})
    frame.to_csv(output_file, index=False)
    print(f"Data successfully saved to {output_file}")
# Merge every batch CSV in the current directory into one file.
def merge_csv_files(output_file, batch_prefix='aligned_H_NMR_batch_'):
    """Concatenate all `batch_prefix*.csv` files (in batch order) into `output_file`.

    Fix: os.listdir returns files in arbitrary order, so the merged rows were
    nondeterministic; sort by the numeric batch suffix (lexical sort would
    put batch_10 before batch_2).
    """
    csv_files = [f for f in os.listdir('.') if f.startswith(batch_prefix) and f.endswith('.csv')]
    csv_files.sort(key=lambda name: int(name[len(batch_prefix):-4]))
    # Fix: pd.concat raises on an empty list; bail out cleanly instead.
    if not csv_files:
        print(f"No files matching {batch_prefix}*.csv found; nothing to merge.")
        return
    df_list = [pd.read_csv(f) for f in csv_files]
    final_df = pd.concat(df_list, ignore_index=True)
    final_df.to_csv(output_file, index=False)
    print(f"All batches merged into {output_file}")
# Entry point: broaden, align, batch-save, and merge all 1H spectra.
def main():
    """Run the full 1H NMR alignment pipeline over the dataset."""
    directory_path = 'D:/Spec2Mol/NMR/nomad_raw_files'  # root folder of Gaussian .log files
    element = 'H'  # 'H' selects 1H NMR
    # Geometry optimized at B3LYP/6-31G(2df,p); NMR at mPW1PW91/6-311+G(2d,p).
    # TMS reference shieldings: C = 186.9704 ppm, H = 31.7617 ppm.
    reference_shielding = 31.7617
    output_file = 'aligned_H_NMR_spectra.csv'  # final merged CSV
    fwhm = 0.5  # Lorentzian full width at half maximum (ppm)
    # Global QM9 1H range: -2.103 .. 19.488 ppm; sample -2.2..19.6 at 0.05 ppm.
    min_shift = -2.2
    max_shift = 19.6
    points = int((max_shift - min_shift) / 0.05) + 1
    x_common = np.linspace(min_shift, max_shift, points)
    # Fix: pass the fwhm variable instead of a duplicated 0.5 literal.
    process_all_log_files_in_batches(directory_path, element, reference_shielding,
                                     x_common, fwhm=fwhm, batch_size=10000)
    merge_csv_files(output_file)

# Run only when executed as a script.
if __name__ == '__main__':
    main()
代码
文本
点个赞吧
本文被以下合集收录
分子光谱建模和预测

CLiu

更新于 2024-09-23
3 篇1 人关注
推荐阅读
公开
Plot lcurve in DeePMD-kit package
AnguseZhang

发布于 2024-03-06
4 赞
公开
训练之前:了解你的数据
咪咪猫骑士

更新于 2024-08-27