147 lines
5.7 KiB
Python
147 lines
5.7 KiB
Python
|
from Bio import Entrez, SeqIO, SearchIO
|
||
|
from Bio.Blast import NCBIWWW, NCBIXML
|
||
|
import os
|
||
|
import statistics
|
||
|
import time
|
||
|
|
||
|
# Ustawienie adresu e-mail dla API NCBI
|
||
|
Entrez.email = "baiobelfer@gmail.com"
|
||
|
|
||
|
# Ścieżki do plików
|
||
|
data_dir = "data"
|
||
|
log_dir = "_log"
|
||
|
out_dir = "out"
|
||
|
os.makedirs(log_dir, exist_ok=True)
|
||
|
os.makedirs(out_dir, exist_ok=True)
|
||
|
|
||
|
# Ścieżki do plików primerów i sekwencji
|
||
|
primer_ITS1_file = os.path.join(data_dir, "ITS1.fasta")
|
||
|
primer_ITS4_file = os.path.join(data_dir, "ITS4.fasta")
|
||
|
sequence_file = os.path.join(data_dir, "sequence.fasta")
|
||
|
|
||
|
# Ścieżka do pliku BLAST
|
||
|
blast_results_file = os.path.join(data_dir, "blast_results.xml")
|
||
|
|
||
|
# Funkcja do wczytywania primerów z plików FASTA
|
||
|
def load_primer(primer_file):
|
||
|
record = SeqIO.read(primer_file, "fasta")
|
||
|
return str(record.seq)
|
||
|
|
||
|
# Funkcja do wyszukiwania miejsc dopasowania primerów w sekwencji genomu
|
||
|
def find_primer_sites(genome_seq, primer_seq):
|
||
|
sites = []
|
||
|
primer_len = len(primer_seq)
|
||
|
for i in range(len(genome_seq) - primer_len + 1):
|
||
|
if genome_seq[i:i + primer_len] == primer_seq:
|
||
|
sites.append(i)
|
||
|
return sites
|
||
|
|
||
|
# Funkcja do analizy amplifikowanych regionów
|
||
|
def analyze_amplification(sequence_file, primer_its1, primer_its4, log_path):
|
||
|
amplification_stats = []
|
||
|
amplified_sequences = []
|
||
|
|
||
|
# Otwieranie pliku logowania
|
||
|
with open(log_path, "w") as log_file:
|
||
|
log_file.write("Amplified Regions:\n")
|
||
|
|
||
|
# Wczytanie genomu
|
||
|
genome = SeqIO.read(sequence_file, "fasta")
|
||
|
genome_seq = str(genome.seq)
|
||
|
|
||
|
# Wyszukiwanie dopasowań primerów
|
||
|
its1_sites = find_primer_sites(genome_seq, primer_its1)
|
||
|
its4_sites = find_primer_sites(genome_seq, primer_its4)
|
||
|
|
||
|
# Analiza amplifikacji między ITS1 i ITS4
|
||
|
for start in its1_sites:
|
||
|
for end in its4_sites:
|
||
|
if end > start:
|
||
|
amplified_region = genome_seq[start:end + len(primer_its4)]
|
||
|
amplified_len = len(amplified_region)
|
||
|
amplification_stats.append(amplified_len)
|
||
|
amplified_sequences.append(amplified_region)
|
||
|
log_file.write(f"Position {start}-{end + len(primer_its4)} | Length: {amplified_len} bp\n")
|
||
|
break
|
||
|
|
||
|
# Analiza statystyczna długości amplifikowanych regionów
|
||
|
if amplification_stats:
|
||
|
avg_length = statistics.mean(amplification_stats)
|
||
|
median_length = statistics.median(amplification_stats)
|
||
|
min_length = min(amplification_stats)
|
||
|
max_length = max(amplification_stats)
|
||
|
|
||
|
log_file.write("\nAmplification Statistics:\n")
|
||
|
log_file.write(f"Average Length: {avg_length} bp\n")
|
||
|
log_file.write(f"Median Length: {median_length} bp\n")
|
||
|
log_file.write(f"Min Length: {min_length} bp\n")
|
||
|
log_file.write(f"Max Length: {max_length} bp\n")
|
||
|
else:
|
||
|
log_file.write("No amplification regions found.\n")
|
||
|
|
||
|
return amplified_sequences
|
||
|
|
||
|
# Funkcja do wysyłania zapytania BLAST do NCBI
|
||
|
def perform_blast(sequences, blast_output_file):
|
||
|
# Łączenie wszystkich amplifikowanych sekwencji w jedno zapytanie
|
||
|
query_sequence = "\n".join(sequences)
|
||
|
|
||
|
# Wysyłanie zapytania BLAST
|
||
|
print("Wysyłanie zapytania BLAST do NCBI...")
|
||
|
result_handle = NCBIWWW.qblast("blastn", "nt", query_sequence)
|
||
|
|
||
|
# Zapisywanie wyników BLAST do pliku
|
||
|
with open(blast_output_file, "w") as out_handle:
|
||
|
out_handle.write(result_handle.read())
|
||
|
result_handle.close()
|
||
|
print(f"Wyniki BLAST zapisane do pliku {blast_output_file}")
|
||
|
|
||
|
# Funkcja do analizy wyników BLAST
|
||
|
def analyze_blast(blast_output_file, analysis_log_path):
|
||
|
print("Analiza wyników BLAST...")
|
||
|
with open(blast_output_file) as result_handle, open(analysis_log_path, "w") as log_file:
|
||
|
blast_records = NCBIXML.parse(result_handle)
|
||
|
for record in blast_records:
|
||
|
for alignment in record.alignments:
|
||
|
for hsp in alignment.hsps:
|
||
|
log_file.write("****HIT****\n")
|
||
|
log_file.write(f"Sequence: {alignment.hit_def}\n")
|
||
|
log_file.write(f"Length: {hsp.align_length}\n")
|
||
|
log_file.write(f"E-value: {hsp.expect}\n")
|
||
|
log_file.write(f"Score: {hsp.score}\n")
|
||
|
log_file.write(f"Identities: {hsp.identities}/{hsp.align_length}\n")
|
||
|
log_file.write("Query sequence:\n")
|
||
|
log_file.write(f"{hsp.query}\n")
|
||
|
log_file.write("Match:\n")
|
||
|
log_file.write(f"{hsp.match}\n")
|
||
|
log_file.write("Subject sequence:\n")
|
||
|
log_file.write(f"{hsp.sbjct}\n\n")
|
||
|
print(f"Analiza BLAST zakończona. Wyniki zapisane do {analysis_log_path}")
|
||
|
|
||
|
def main():
|
||
|
# Wczytywanie primerów
|
||
|
primer_ITS1 = load_primer(primer_ITS1_file)
|
||
|
primer_ITS4 = load_primer(primer_ITS4_file)
|
||
|
|
||
|
# Analiza amplifikacji i zapis logów
|
||
|
log_path = os.path.join(log_dir, "amplification_analysis_log.txt")
|
||
|
amplified_sequences = analyze_amplification(sequence_file, primer_ITS1, primer_ITS4, log_path)
|
||
|
|
||
|
print(f"Amplification analysis completed. Results saved to {log_path}")
|
||
|
|
||
|
if amplified_sequences:
|
||
|
# Wykonanie BLAST dla amplifikowanych regionów
|
||
|
perform_blast(amplified_sequences, blast_results_file)
|
||
|
|
||
|
# Analiza wyników BLAST
|
||
|
blast_analysis_log = os.path.join(log_dir, "blast_analysis_log.txt")
|
||
|
analyze_blast(blast_results_file, blast_analysis_log)
|
||
|
|
||
|
print(f"BLAST analysis completed. Results saved to {blast_analysis_log}")
|
||
|
else:
|
||
|
print("Brak amplifikowanych regionów do analizy BLAST.")
|
||
|
|
||
|
if __name__ == "__main__":
|
||
|
main()
|
||
|
|