將A序列著絲粒區(qū)域與其它品種比對(duì),查看特異性,這里我試著使用
將A序列拆分成1kb的序列,再將序列比對(duì)到合并后的其它品種序列,流程如下:
首先將序列拆分成1kb的區(qū)段:
def parse_fasta(fasta_file):
"""解析FASTA文件,返回{標(biāo)題: 完整序列}的字典"""
sequences = {}
current_title = None
current_seq = []
with open(fasta_file, 'r') as f:
for line in f:
line = line.strip() # 去除首尾空白(包括換行符)
if not line:
continue # 跳過(guò)空行
if line.startswith('>'):
# 遇到新標(biāo)題行:保存上一條序列(如果存在)
if current_title is not None:
sequences[current_title] = ''.join(current_seq)
# 提取新標(biāo)題(保留>后的所有內(nèi)容,包括空格和描述)
current_title = line[1:] # 去掉開(kāi)頭的>
current_seq = [] # 重置序列列表
else:
# 序列行:拼接(忽略大小寫(xiě),保留原始字符)
current_seq.append(line.upper() if False else line) # 如需轉(zhuǎn)為大寫(xiě),將False改為T(mén)rue
# 保存最后一條序列
if current_title is not None and current_seq:
sequences[current_title] = ''.join(current_seq)
return sequences
def split_sequence(title, full_seq, chunk_size=1000):
"""將長(zhǎng)序列切分為指定長(zhǎng)度的片段,返回片段列表[(新標(biāo)題, 片段序列), ...]"""
seq_len = len(full_seq)
if seq_len == 0:
print(f"警告:標(biāo)題為 '{title}' 的序列為空,已跳過(guò)。")
return []
fragments = []
part_num = 1
start = 1 # 生物學(xué)序列通常用1-based索引
while start <= seq_len:
end = start + chunk_size - 1
if end > seq_len:
end = seq_len # 最后一個(gè)片段可能不足1000bp
# 提取當(dāng)前片段(Python切片是0-based,所以需要轉(zhuǎn)換)
fragment_seq = full_seq[start-1:end] # start-1是0-based起始位置,end是0-based結(jié)束位置(不包含)
# 生成新標(biāo)題(包含原標(biāo)題、片段編號(hào)、位置范圍)
new_title = f"{title}_part{part_num}_{start}-{end}"
fragments.append((new_title, fragment_seq))
# 準(zhǔn)備下一個(gè)片段
part_num += 1
start = end + 1
return fragments
def format_sequence(seq, line_length=80):
"""將序列按指定長(zhǎng)度換行(默認(rèn)80字符/行),返回格式化后的多行字符串"""
return '\n'.join([seq[i:i+line_length] for i in range(0, len(seq), line_length)])
def main(input_file="IR64_merged.fasta", output_file="IR64_merged_split.fasta", chunk_size=1000):
# 1. 解析輸入FASTA文件
print(f"正在讀取文件:{input_file}")
try:
sequences = parse_fasta(input_file)
except FileNotFoundError:
print(f"錯(cuò)誤:文件 '{input_file}' 不存在,請(qǐng)檢查路徑。")
return
except Exception as e:
print(f"解析文件時(shí)出錯(cuò):{str(e)}")
return
if not sequences:
print("錯(cuò)誤:未從文件中解析到任何序列。")
return
print(f"成功解析 {len(sequences)} 條序列。")
# 2. 分割所有序列并寫(xiě)入輸出文件
with open(output_file, 'w') as f_out:
total_fragments = 0
for title, full_seq in sequences.items():
# 切分當(dāng)前序列
fragments = split_sequence(title, full_seq, chunk_size)
total_fragments += len(fragments)
# 寫(xiě)入每個(gè)片段
for new_title, frag_seq in fragments:
# 寫(xiě)入標(biāo)題行
f_out.write(f">{new_title}\n")
# 寫(xiě)入格式化的序列(每行80字符)
f_out.write(f"{format_sequence(frag_seq)}\n")
print(f"處理完成!共生成 {total_fragments} 個(gè)片段,已保存到 {output_file}")
if __name__ == "__main__":
# 可直接修改以下參數(shù):片段長(zhǎng)度、輸入/輸出文件名
main(
input_file="merged_all.fasta",
output_file="merged_all_split.fasta",
chunk_size=1000 # 如需修改片段長(zhǎng)度,調(diào)整此處(如500)
)
將其它基因組合并:
import os
def get_fasta_files():
"""獲取當(dāng)前目錄下所有 .fasta 和 .fa 文件(排除隱藏文件和子目錄)"""
fasta_extensions = ('.fasta', '.fa')
fasta_files = []
for filename in os.listdir('.'):
# 跳過(guò)隱藏文件(以.開(kāi)頭)和目錄
if filename.startswith('.') or os.path.isdir(filename):
continue
# 檢查擴(kuò)展名
if filename.lower().endswith(fasta_extensions):
fasta_files.append(filename)
return sorted(fasta_files) # 按文件名排序
def count_sequences_in_file(filepath):
"""統(tǒng)計(jì)單個(gè)FASTA文件中的序列數(shù)量(按>標(biāo)題行計(jì)數(shù))"""
count = 0
try:
with open(filepath, 'r') as f:
for line in f:
line = line.strip()
if line.startswith('>'):
count += 1
except Exception as e:
print(f"?? 讀取文件 {filepath} 時(shí)出錯(cuò):{str(e)},已跳過(guò)該文件")
return count
def check_duplicate_titles(fasta_files):
"""檢查所有文件中是否有重復(fù)的序列標(biāo)題,返回重復(fù)標(biāo)題字典"""
title_counts = {}
for filename in fasta_files:
try:
with open(filename, 'r') as f:
for line in f:
line = line.strip()
if line.startswith('>'):
title = line[1:] # 去掉>
if title in title_counts:
title_counts[title].append(filename)
else:
title_counts[title] = [filename]
except Exception as e:
continue # 忽略讀取錯(cuò)誤的文件
# 篩選出重復(fù)的標(biāo)題(出現(xiàn)次數(shù)>1)
duplicates = {title: files for title, files in title_counts.items() if len(files) > 1}
return duplicates
def merge_fasta_files(output_filename="merged_all.fasta"):
# 1. 獲取所有FASTA文件
fasta_files = get_fasta_files()
if not fasta_files:
print("? 未在當(dāng)前目錄找到任何 .fasta 或 .fa 文件")
return
# 2. 檢查重復(fù)標(biāo)題(提前提示)
duplicate_titles = check_duplicate_titles(fasta_files)
if duplicate_titles:
print(f"?? 警告:發(fā)現(xiàn) {len(duplicate_titles)} 個(gè)重復(fù)的序列標(biāo)題(可能來(lái)自不同文件):")
for title, files in list(duplicate_titles.items())[:5]: # 只顯示前5個(gè)
print(f" 標(biāo)題:{title[:50]}... 出現(xiàn)于:{', '.join(files)}")
if len(duplicate_titles) > 5:
print(f" ... 還有 {len(duplicate_titles)-5} 個(gè)重復(fù)標(biāo)題未顯示")
# 3. 合并文件
total_files = len(fasta_files)
total_sequences = 0
print(f"\n開(kāi)始合并 {total_files} 個(gè)FASTA文件...")
with open(output_filename, 'w') as out_f:
for idx, filename in enumerate(fasta_files, 1):
# 統(tǒng)計(jì)當(dāng)前文件的序列數(shù)
seq_count = count_sequences_in_file(filename)
if seq_count == 0:
print(f"?? 跳過(guò)空文件或格式異常文件:{filename}")
continue
# 寫(xiě)入當(dāng)前文件內(nèi)容
try:
with open(filename, 'r') as in_f:
# 讀取并寫(xiě)入所有行(保留原始格式,包括換行)
content = in_f.read()
# 確保文件末尾有換行,避免序列粘連
if not content.endswith('\n'):
content += '\n'
out_f.write(content)
total_sequences += seq_count
print(f"? 已合并 ({idx}/{total_files}):{filename}(包含 {seq_count} 條序列)")
except Exception as e:
print(f"? 合并文件 {filename} 失?。簕str(e)},已跳過(guò)")
# 4. 輸出結(jié)果統(tǒng)計(jì)
print(f"\n?? 合并完成!")
print(f" 合并文件總數(shù):{total_files} 個(gè)")
print(f" 合并序列總數(shù):{total_sequences} 條")
print(f" 結(jié)果已保存至:{output_filename}")
if duplicate_titles:
print(f" 注意:存在重復(fù)標(biāo)題,可能影響下游分析(見(jiàn)上文警告)")
if __name__ == "__main__":
# 可自定義輸出文件名(如修改為 "all_sequences.fasta")
merge_fasta_files(output_filename="merged_all.fasta")

比對(duì)結(jié)果表

image.png

結(jié)果統(tǒng)計(jì)

圖片及結(jié)果

使用minimap2比對(duì)結(jié)果: