genetics/alg/analysis.py

147 lines
5.7 KiB
Python
Raw Permalink Normal View History

2024-11-07 12:10:11 +00:00
from Bio import Entrez, SeqIO, SearchIO
from Bio.Blast import NCBIWWW, NCBIXML
import os
import statistics
import time
# Ustawienie adresu e-mail dla API NCBI
Entrez.email = "baiobelfer@gmail.com"
# Ścieżki do plików
2024-11-07 12:39:46 +00:00
data_dir = "../data"
log_dir = "../_log"
out_dir = "../out"
2024-11-07 12:10:11 +00:00
os.makedirs(log_dir, exist_ok=True)
os.makedirs(out_dir, exist_ok=True)
# Ścieżki do plików primerów i sekwencji
primer_ITS1_file = os.path.join(data_dir, "ITS1.fasta")
primer_ITS4_file = os.path.join(data_dir, "ITS4.fasta")
sequence_file = os.path.join(data_dir, "sequence.fasta")
# Ścieżka do pliku BLAST
blast_results_file = os.path.join(data_dir, "blast_results.xml")
# Funkcja do wczytywania primerów z plików FASTA
def load_primer(primer_file):
record = SeqIO.read(primer_file, "fasta")
return str(record.seq)
# Funkcja do wyszukiwania miejsc dopasowania primerów w sekwencji genomu
def find_primer_sites(genome_seq, primer_seq):
sites = []
primer_len = len(primer_seq)
for i in range(len(genome_seq) - primer_len + 1):
if genome_seq[i:i + primer_len] == primer_seq:
sites.append(i)
return sites
# Funkcja do analizy amplifikowanych regionów
def analyze_amplification(sequence_file, primer_its1, primer_its4, log_path):
amplification_stats = []
amplified_sequences = []
# Otwieranie pliku logowania
with open(log_path, "w") as log_file:
log_file.write("Amplified Regions:\n")
# Wczytanie genomu
genome = SeqIO.read(sequence_file, "fasta")
genome_seq = str(genome.seq)
# Wyszukiwanie dopasowań primerów
its1_sites = find_primer_sites(genome_seq, primer_its1)
its4_sites = find_primer_sites(genome_seq, primer_its4)
# Analiza amplifikacji między ITS1 i ITS4
for start in its1_sites:
for end in its4_sites:
if end > start:
amplified_region = genome_seq[start:end + len(primer_its4)]
amplified_len = len(amplified_region)
amplification_stats.append(amplified_len)
amplified_sequences.append(amplified_region)
log_file.write(f"Position {start}-{end + len(primer_its4)} | Length: {amplified_len} bp\n")
break
# Analiza statystyczna długości amplifikowanych regionów
if amplification_stats:
avg_length = statistics.mean(amplification_stats)
median_length = statistics.median(amplification_stats)
min_length = min(amplification_stats)
max_length = max(amplification_stats)
log_file.write("\nAmplification Statistics:\n")
log_file.write(f"Average Length: {avg_length} bp\n")
log_file.write(f"Median Length: {median_length} bp\n")
log_file.write(f"Min Length: {min_length} bp\n")
log_file.write(f"Max Length: {max_length} bp\n")
else:
log_file.write("No amplification regions found.\n")
return amplified_sequences
# Funkcja do wysyłania zapytania BLAST do NCBI
def perform_blast(sequences, blast_output_file):
# Łączenie wszystkich amplifikowanych sekwencji w jedno zapytanie
query_sequence = "\n".join(sequences)
# Wysyłanie zapytania BLAST
print("Wysyłanie zapytania BLAST do NCBI...")
result_handle = NCBIWWW.qblast("blastn", "nt", query_sequence)
# Zapisywanie wyników BLAST do pliku
with open(blast_output_file, "w") as out_handle:
out_handle.write(result_handle.read())
result_handle.close()
print(f"Wyniki BLAST zapisane do pliku {blast_output_file}")
# Funkcja do analizy wyników BLAST
def analyze_blast(blast_output_file, analysis_log_path):
print("Analiza wyników BLAST...")
with open(blast_output_file) as result_handle, open(analysis_log_path, "w") as log_file:
blast_records = NCBIXML.parse(result_handle)
for record in blast_records:
for alignment in record.alignments:
for hsp in alignment.hsps:
log_file.write("****HIT****\n")
log_file.write(f"Sequence: {alignment.hit_def}\n")
log_file.write(f"Length: {hsp.align_length}\n")
log_file.write(f"E-value: {hsp.expect}\n")
log_file.write(f"Score: {hsp.score}\n")
log_file.write(f"Identities: {hsp.identities}/{hsp.align_length}\n")
log_file.write("Query sequence:\n")
log_file.write(f"{hsp.query}\n")
log_file.write("Match:\n")
log_file.write(f"{hsp.match}\n")
log_file.write("Subject sequence:\n")
log_file.write(f"{hsp.sbjct}\n\n")
print(f"Analiza BLAST zakończona. Wyniki zapisane do {analysis_log_path}")
def main():
# Wczytywanie primerów
primer_ITS1 = load_primer(primer_ITS1_file)
primer_ITS4 = load_primer(primer_ITS4_file)
# Analiza amplifikacji i zapis logów
log_path = os.path.join(log_dir, "amplification_analysis_log.txt")
amplified_sequences = analyze_amplification(sequence_file, primer_ITS1, primer_ITS4, log_path)
print(f"Amplification analysis completed. Results saved to {log_path}")
if amplified_sequences:
# Wykonanie BLAST dla amplifikowanych regionów
perform_blast(amplified_sequences, blast_results_file)
# Analiza wyników BLAST
blast_analysis_log = os.path.join(log_dir, "blast_analysis_log.txt")
analyze_blast(blast_results_file, blast_analysis_log)
print(f"BLAST analysis completed. Results saved to {blast_analysis_log}")
else:
print("Brak amplifikowanych regionów do analizy BLAST.")
if __name__ == "__main__":
main()