좋은 소식. 이거는 전편 후편 없다. 나쁜 소식. 트리가 한타 때에 비해 안 이쁘다.
# 모듈
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm # 넌 뭐냐
from Bio import Entrez, SeqIO # 왼쪽: 일단 털어보자/오른쪽: 시퀀스 다루려면 필요합니다. 필수임.
from Bio import AlignIO # 서열 분석해줄 친구
from Bio import Phylo # 트리 그릴라면 필요해요
from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor
import io # 누구세요?
import subprocess # 서브 프로세스(이건 또 뭐여...)
from collections import Counter
import re
# Base plotting configuration
plt.rcParams.update({
    'font.family': 'Nanumbarunpen',  # NanumBarunpen (the author dislikes the default Gothic fonts)
    # 'font.family': 'AppleGothic',
    'font.size': 14,
    'axes.unicode_minus': False,
})
# Pre-run setup
muscle_exe = "/opt/homebrew/bin/muscle"  # path to the MUSCLE binary (`which muscle` prints it)
Entrez.email = "blackholekun@gmail.com"  # e-mail address handed to Entrez
여긴 다 아니까 넘어갑시다. 본인은 고딕 자체를 딱딱해서 별로 안 좋아하지만 특히나 기본 고딕(맑은고딕, 애플고딕)을 증말 싫어하고 그나마 뭐 제출하는 자리에서 타협보는게 나눔고딕, 나눔스퀘어, 프리텐다드다. 기본고딕은 뭔가 기깔나지 아니함.
# Fetch influenza A (H3N2) HA sequences from NCBI.
# Query terms: influenza A, H3N2 subtype, HA gene, human host, and records
# published in 2025 ([PDAT] is the publication date, not the collection date).
query = "Influenza A virus AND H3N2 AND HA[Gene Name] AND 2025[PDAT] AND Homo sapiens[Host]"

# 1. Look up the IDs of the matching records.
handle = Entrez.esearch(db="nucleotide", term=query, retmax=160)
try:
    search_result = Entrez.read(handle)
finally:
    handle.close()  # release the connection even if parsing raises
id_list = search_result["IdList"]
if not id_list:
    # Stop here with a clear message instead of sending an efetch request with no IDs.
    raise RuntimeError(f"No nucleotide records matched the query: {query}")

# 2. Download the actual sequence data (FASTA format).
fetch_handle = Entrez.efetch(db="nucleotide", id=id_list, rettype="fasta", retmode="text")
try:
    sequences = list(SeqIO.parse(fetch_handle, "fasta"))
finally:
    fetch_handle.close()

# 3. Save to disk.
with open("influenza_h3n2.fasta", "w") as f:
    SeqIO.write(sequences, f, "fasta")

print(f"성공적으로 {len(sequences)}개의 서열을 가져왔습니다.")
print("----------")
# Preview the first three records.
for record in sequences[:3]:
    print(f"ID: {record.id}")
    print(f"Description: {record.description}")
    print(f"Length: {len(record.seq)} bp\n")
쿼리가 좀 길죠? 일단 쿼리 내용을 뜯어보자.
Influenza A virus AND H3N2 AND HA[Gene Name] AND 2025[PDAT] AND Homo sapiens[Host]
1. Influenza A virus AND H3N2: 인플루엔자 H3N2(인플루엔자 바이러스는 밖에 나와있는 해마글루티닌과 뉴라미니데이스 번호로 지칭함... H5N1: 해마5 뉴라1)
2. HA[Gene Name]: 해마글루티닌으로 찾는다는 얘기다. 이놈은 게놈에 헤마글루티닌, 뉴라미니데이스, 뉴클레오프로틴, M1, M2, NS1, NS2(NEP), PA, PB1, PB1-F2, PB2가 있고 인플루엔자 데이터가 급나 많기때문에... 그리고 다른거 섞여들어가면 MSA 개발살남.
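3. 2025[PDAT]: 2025년에 데이터베이스에 공개(등록)된 레코드만 가져온다는 조건이다. PDAT는 채집 연도가 아니라 공개일이라서, 실제로는 2019~2023년에 분리된 균주도 같이 딸려 나온다(아래 TOP 5 결과 참고). 그래도 저걸 안 달면 온갖가지 시즌에 유행했던 인플루엔자 데이터가 다 나온다.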
4. Homo sapiens[Host]: 숙주가 사람인... 그니까 사람한테 감염되는 바이러스 정보. 저거 안 하면 어떻게 되냐고요? 개 인플루엔자 나옵니다.
# Preliminary statistics: how many records came from each location?
# The location is the field right after "A/" in the strain name (e.g. A/Shanghai/...).
location_hits = (re.search(r'A/([^/]+)/', record.description) for record in sequences)
locations = [hit.group(1) for hit in location_hits if hit]

# Frequency of each location, most common first.
location_counts = Counter(locations)
print("--- 수집된 데이터 지역 분포 ---")
for loc, count in location_counts.most_common():
    print(f"{loc}: {count}개")
갖고 온 데이터에서 유행한 지역 비율을 구하는 코드이다. 콜로라도 몇건, 상하이 몇건, 이런 식으로.
--- 수집된 데이터 지역 분포 ---
Rhode Island: 36개
Michigan: 21개
New York: 16개
Wisconsin: 11개
Pennsylvania: 11개
Izmir: 10개
Massachusetts: 9개
Virginia: 7개
North Carolina: 7개
Tennessee: 7개
Shanghai: 6개
Colorado: 6개
Maryland: 6개
California: 5개
Minnesota: 5개
Washington: 4개
Vermont: 4개
Texas: 3개
North Dakota: 3개
Nebraska: 3개
New Jersey: 3개
Nevada: 2개
Missouri: 2개
West Virginia: 2개
Oregon: 2개
Jinan: 1개
Utah: 1개
New Mexico: 1개
Connecticut: 1개
Yigo: 1개
Ohio: 1개
New Hampshire: 1개
Louisiana: 1개
Kentucky: 1개
야 이건 웬일로 글자제한 안짤리네. (네이버 얘기)
# Relabel records by year and location (function)
# Location: the field right after "A/" in the strain name (A/<location>/...).
_STRAIN_LOCATION = re.compile(r'A/([^/]+)/')
# Year: a 4-digit field that is NOT followed by another digit or another "/" field,
# so a 4-digit isolate number such as ".../4512/2019" is not mistaken for the year.
_STRAIN_YEAR = re.compile(r'/(\d{4})(?![\d/])')


def clean_flu_labels(sequences):
    """Rewrite each record's id and description as ``"<year>_<location>"``.

    Both fields are parsed from ``record.description``. The records are
    modified in place, and the same ``sequences`` object is returned.

    Args:
        sequences: Iterable of records exposing ``id`` and ``description``.

    Returns:
        The ``sequences`` argument itself, with every record relabelled.
        Missing fields fall back to ``"XXXX"`` (year) and ``"Unknown"``
        (location). Labels are not guaranteed to be unique.
    """
    for record in sequences:
        description = record.description
        # 1. Location (A/<location>/...)
        loc_match = _STRAIN_LOCATION.search(description)
        location = loc_match.group(1) if loc_match else "Unknown"
        # 2. Year (4-digit strain-name field)
        year_match = _STRAIN_YEAR.search(description)
        year = year_match.group(1) if year_match else "XXXX"
        # 3. Short label for readability on the tree (e.g. 2023_Shanghai)
        record.id = f"{year}_{location}"
        record.description = record.id  # keep the description in sync with the id
    return sequences
# Apply the relabelling.
# NOTE: influenza_h3n2.fasta was already written before this point, so the file
# that MUSCLE aligns still carries the original accession IDs, not these labels.
labeled_sequences = clean_flu_labels(sequences)
# Sanity check: show the first five new labels.
for labeled in labeled_sequences[:5]:
    print(labeled.id)
이건 내가 갖고 온 데이터가 언제, 어디서 유행한 인플루엔자에 대한 데이터인지 라벨링하는 절차다.
print('MSA start... ')
# Run the multiple sequence alignment with MUSCLE.
# This takes a while — start it and go get some fresh air.
try:
    result = subprocess.run([muscle_exe, "-align", "influenza_h3n2.fasta", "-output", "influenza_h3n2_muscle_aligned.fasta"], check=True, capture_output=True, text=True)
except subprocess.CalledProcessError as e:
    print(f"MSA failed: {e}")
    if e.stderr:
        print(e.stderr)  # MUSCLE's own error message is captured here
    # Do not fall through: reading the output file after a failed run would
    # either fail or silently load a stale alignment from an earlier run.
    raise
else:
    print("Completed. ")
    # Only load the alignment when MUSCLE actually succeeded.
    alignment = AlignIO.read("influenza_h3n2_muscle_aligned.fasta", "fasta")
오래걸리니까 이거 돌려놓고 똥 함 때리고 오시면 되겠습니다. 저 예외처리에 파이널리는 마지막에 해야 하는거다.
print("====== MSA Result ======")
# Reload the aligned FASTA and show the first 100 columns of every row.
alignment = AlignIO.read("influenza_h3n2_muscle_aligned.fasta", "fasta")
for aligned in alignment:
    short_id = aligned.id[:15]
    preview = aligned.seq[:100]
    print(f"{short_id:<15} : {preview}")
====== MSA Result ======
OP847931.1 : ---------------------------------ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAAATTCCTGGAAATG
OP847971.1 : ---------------------------------ATGAAGACTATCATTGCTTTGAGCTACACTCTATGTCTGGTTTTCGCTCAAAAAATTCCTGGAAATG
OP847947.1 : ---------------------------------ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAAATTCCTGGAAATG
PQ483286.1 : ------------------------------------------------------------------TGTCTGGTTTTCACTCAAAAAATTCCTGGAAATG
PX759059.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX759051.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
OP847923.1 : ---------------------------------ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAAATTCCTGGAAATG
PX782326.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX710628.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX710585.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX710642.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX782330.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX710546.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX710564.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX714963.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX782332.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
OP847963.1 : ---------------------------------ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAAATTCCTGGAAATG
OP847939.1 : ---------------------------------ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAAATTCCTGGAAATG
PX708931.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708486.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708510.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
OP848011.1 : ---------------------------------ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAAATTCCTGGAAATG
PX714959.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX714956.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX782341.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAGACATACCTGGAAATG
PX710622.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX710532.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX782343.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX710611.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX710558.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX710521.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX710635.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX710515.1 : ---------------------------------ATGAAAGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX714962.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX782328.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX714955.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX710579.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX710504.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX710495.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX782342.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX714960.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX710600.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX714958.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX714965.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX714964.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX714961.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX710574.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX710536.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX782337.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX782331.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX759243.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX782335.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX782336.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
OP847979.1 : ---------------------------------ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAAATTCCTGGAAATG
PX782344.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX782339.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX750759.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX782333.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX782340.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX759290.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX759195.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX782327.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX714954.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX714957.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX759251.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX759219.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX759211.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX782325.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX782338.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX710592.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX759235.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX759131.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX759227.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX759203.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX759115.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX759099.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX759091.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX759043.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX759275.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX759259.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX759083.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX759075.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX759298.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX759147.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX759027.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGGAATG
PX759283.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX759267.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX782329.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX759107.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
OP847955.1 : ---------------------------------ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAAATTCCTGGAAATG
PX759139.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX759067.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX759123.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PV570273.1 : A----------GCAGGGGATAATTCTATTAACCATGAAGACTATCATTGCCTTGAGCTACACTCTATGTCTTGTTTTCGCTCAAAAAATCCCTGGAAATG
PX759187.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX759179.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX759171.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX759163.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX759155.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX759035.1 : ---------------------------------ATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708636.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708915.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708446.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708574.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAAAG
PX709003.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
OP848003.1 : ---------------------------------ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAAATTCCTGGAAATG
PX708820.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708454.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708620.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCCCTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708668.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX734586.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX734610.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX734602.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX734492.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX708374.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX708628.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708939.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708430.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708366.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708852.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708844.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708494.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708947.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX734530.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX708550.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX734546.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX734522.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX708907.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708438.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708796.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708995.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708923.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708700.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708382.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX734506.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX708955.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX708899.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX708756.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX708748.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX708740.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX708732.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX708724.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX708716.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX708518.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX708422.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX708684.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708804.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708676.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708534.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708582.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX734514.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX708708.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX708692.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX734486.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX708764.1 : ----------------GGATAGTTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708891.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708883.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708470.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX734554.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATTATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX708526.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708828.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTGTGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708542.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708478.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708398.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708390.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708652.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708788.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708772.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708566.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708358.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX734499.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708836.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTGTGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708780.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708660.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708558.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708987.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX708979.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX708971.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX708963.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX708502.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708462.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PV570299.1 : G-----------CAGGGGATAATTCTATTAACCATGAAGACCATCATTGCCTTGAGCTACACTCTATGTCTTGTTTTCGCTCAAAAAATCCCTGGAAATG
PX734578.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX734570.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708596.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708588.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708812.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX734594.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PX708612.1 : ----------------GGATAGTTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708604.1 : ----------------GGATAGTTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708414.1 : ----------------GGATAGTTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708406.1 : ----------------GGATAGTTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708875.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708868.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX734538.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAACATACCTGGAAATG
PV570315.1 : CCAAGGCAAAAGCAGGGGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PX708644.1 : ----------------GGATAATTCTATTAACCATGAAGGCTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATG
PV570274.1 : ---------------------------------ATGAAGACCATCATTGCCTTGAGCTACACTCTATGTCTTGTTTTCGCTCAAAAAATCCCTGGAAATG
PV570291.1 : ---------------------------------ATGAAGACTATCATTGCCTTGAGCTACACTCTATGTCTTGTTTTCGCTCAAAAAATCCCTGGAAATG
PV570275.1 : ---------------------------------------------------------------------CTTGTTTTCGCTCAAAAAATACCTGGAAATG
와 네이버 이걸 안짤랐어… (200개 돌림)
# Per-column variability score (1 - frequency of the most common symbol)
def get_top_variable_sites(alignment, top_n=5, ignore_gaps=False):
    """Return the ``top_n`` most variable columns of an alignment.

    The variability of a column is ``1 - (count of its most common symbol /
    number of symbols considered)``. This is a majority-frequency score, not
    a Shannon entropy.

    Args:
        alignment: Object providing ``get_alignment_length()`` and column
            access via ``alignment[:, i]`` returning the column as a string.
        top_n: Number of columns to return.
        ignore_gaps: When True, ``"-"`` characters are dropped before scoring,
            so partially covered columns are not ranked as highly variable
            just because many rows are gaps. Defaults to False, which keeps
            the original behaviour of counting gaps as an ordinary symbol.

    Returns:
        A list of ``(column_index, variability)`` tuples, highest variability
        first. Ties keep ascending column order. A column with no symbols to
        score gets variability ``0.0``.
    """
    length = alignment.get_alignment_length()
    variability = []
    for i in range(length):
        column = alignment[:, i]
        if ignore_gaps:
            column = column.replace("-", "")
        if not column:
            # Nothing to compare (no rows, or gaps only): treat as invariant.
            variability.append((i, 0.0))
            continue
        # Frequency of the most common symbol in this column.
        most_common_count = Counter(column).most_common(1)[0][1]
        variability.append((i, 1 - most_common_count / len(column)))
    # Highest variability first (sorted() is stable, so ties stay in column order).
    return sorted(variability, key=lambda x: x[1], reverse=True)[:top_n]
# Report the most variable alignment columns.
top_sites = get_top_variable_sites(alignment)
print("--- 변이가 집중된 주요 포지션 ---")
for pos, score in top_sites:
    percent = score * 100
    print(f"Position {pos}: 변이율 {percent:.1f}%")
그 엔트로피… 여기서도 함 해봅시다. 한타때도 얘기했지만 바이러스라고 해서 아무데나 막 변이하지 않습니다. 증식에 영향을 안 끼치는 것도 있고 잘못하면 지가 X되는 변이도 있거든…
--- 변이가 집중된 주요 포지션 ---
Position 1734: 변이율 56.5%
Position 16: 변이율 50.0%
Position 17: 변이율 50.0%
Position 18: 변이율 50.0%
Position 19: 변이율 50.0%
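백몇개 돌렸을때랑 다른데? (참고로 16~19번 자리는 진짜 변이가 아니다. 위 정렬 결과를 보면 절반쯤 되는 서열이 33번째 칸까지 갭(-)으로 채워져 있고 나머지는 16번 자리부터 염기가 시작해서, 갭까지 하나의 문자로 세는 이 계산에서는 50%가 나오는 것이다.)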
align = AlignIO.read("influenza_h3n2_muscle_aligned.fasta", "fasta")
length = align.get_alignment_length()
n_seqs = len(align)

# Shannon entropy (in bits) of every alignment column, used as a variability measure.
# Every character in the column is counted, so gap characters act as one more symbol.
entropy_list = []
for i in range(length):
    # Symbol composition of this column.
    counts = Counter(align[:, i])
    probs = [count / n_seqs for count in counts.values()]
    entropy_list.append(-sum(p * np.log2(p) for p in probs))

plt.figure(figsize=(20, 6))
plt.plot(entropy_list, color='darkblue', linewidth=1)
plt.fill_between(range(length), entropy_list, color='skyblue', alpha=0.4)
plt.title("H3N2 HA Sequence Entropy (Prediction Map)", fontsize=15)
# The alignment is built from nucleotide records, so the x axis is an alignment
# column index in nucleotides, not an amino-acid position.
plt.xlabel("Alignment Position (nucleotide)")
plt.ylabel("Entropy (Mutation Intensity)")
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

빗이야 뭐야... 쩌기 1750 언저리 보이시죠? 500 좀 가서 하나 더 있고. 저기가 엔트로피가 높다는 얘기는 변이율이 높다는 얘기다. 그니까 슈뢰딩거도 까봐야 안다 이거지.
# Build and draw the phylogenetic tree
calculator = DistanceCalculator('identity')
dm = calculator.get_distance(alignment)
constructor = DistanceTreeConstructor(calculator, 'nj')
# Run neighbor joining on the matrix computed above; build_tree(alignment)
# would calculate the same pairwise distances a second time.
tree = constructor.nj(dm)

terms = tree.get_terminals()
tip_x = [tree.distance(t) for t in terms]  # root-to-tip distance of every terminal
x_limit = max(tip_x)

# Year/location of each record, parsed from the strain name in its description
# ("A/<location>/<isolate>/<year>"). Tip names are left untouched so that later
# lookups by record id keep working.
location_re = re.compile(r'A/([^/]+)/')
year_re = re.compile(r'/(\d{4})(?![\d/])')  # 4-digit field not followed by a digit or "/"
tip_info = {}
for rec in alignment:
    loc_match = location_re.search(rec.description)
    if not loc_match:
        continue
    year_match = year_re.search(rec.description)
    tip_info[rec.id] = (year_match.group(1) if year_match else "XXXX", loc_match.group(1))

# One colour per location; the palette is reused cyclically when there are
# more locations than colours.
known_locations = sorted({loc for _, loc in tip_info.values()})
location_colors = {loc: cm.tab20(i % cm.tab20.N) for i, loc in enumerate(known_locations)}
default_color = 'black'
highlight_id = "LC909067"  # tips whose name contains this id are drawn in bold

fig = plt.figure(figsize=(20, 60), dpi=150)
ax = fig.add_subplot(1, 1, 1)
# Draw the branches only; the tip labels are placed manually below.
Phylo.draw(tree, axes=ax, do_show=False, label_func=lambda x: "", show_confidence=False)

for i, node in enumerate(terms):
    # Assumes Phylo.draw places terminals at y = 1..N in get_terminals() order.
    y_pos = i + 1
    x_pos = tip_x[i]  # x coordinate where the branch ends
    orig_name = str(node.name)
    if orig_name in tip_info:
        # Label format: [year] location (ID)
        year, location = tip_info[orig_name]
        display_text = f" ◀ [{year}] {location} ({orig_name})"
        color = location_colors[location]
    else:
        display_text = f" ◀ {orig_name}"
        color = default_color
    # Put the text right at the branch tip.
    ax.text(x_pos, y_pos, display_text,
            va='center', ha='left',
            fontsize=14, color=color,
            fontweight='bold' if highlight_id in orig_name else 'normal')

ax.set_xlim(0, x_limit * 1.8)  # leave room on the right for the labels
ax.set_ylim(0, len(terms) + 2)
ax.set_axis_off()  # hide the axis ticks and numbers
plt.rc('font', size=14)  # font size inside the figure
plt.rc('axes', titlesize=20)  # title size
plt.title("Influenza A (H3N2) HA Phylogenetic Tree by Region/Year")
# Set before savefig so the saved file and the shown figure agree.
# NOTE: set_axis_off() above also hides axis labels, so this label only
# becomes visible if the axis is switched back on.
plt.xlabel("Genetic Distance (Substitutions per site)")
plt.tight_layout()
plt.savefig("Influenza_H3N2_Final_Tree.png", dpi=300, bbox_inches='tight')
plt.show()
길고 아름다운 트리를 볼 준비가 되셨습니까?

안됐어도 봐야됨.
종으로 필터링 안했을때는 생뚱맞게 개 독감 하나 튀어나와서 저기 어디 지 혼자 떨어져있었음.
# Rank the tree tips by root-to-tip distance, keeping human-host records only.
def _strain_label(full_info):
    """Return a short strain label extracted from a record description.

    ``"<year>_<location>"`` labels become ``"[year] location"``. Otherwise the
    parenthesised strain name, e.g. ``A/Shanghai/B623090301/2023(H3N2)``, is
    returned, and the text is returned unchanged when neither form is found.
    """
    label_match = re.fullmatch(r'(\d{4}|XXXX)_(.+)', full_info)
    if label_match:
        return f"[{label_match.group(1)}] {label_match.group(2)}"
    # Strain name in parentheses, optionally with a nested "(H3N2)" subtype.
    strain_match = re.search(r'\((A/[^()]+(?:\([^()]*\))?)\)', full_info)
    return strain_match.group(1) if strain_match else full_info


def _find_description(name, descriptions):
    """Look up a tip name: exact record id first, then a substring match."""
    if name in descriptions:
        return descriptions[name]
    return next((desc for rec_id, desc in descriptions.items()
                 if rec_id in name or name in desc), None)


# Description of every aligned record, keyed by record id.
record_descriptions = {rec.id: (rec.description or rec.id) for rec in alignment}
tip_descriptions = {str(t.name): _find_description(str(t.name), record_descriptions)
                    for t in tree.get_terminals()}

# Keep human viruses only. The host keyword is searched in the description as
# well, because a bare accession id never contains it.
human_terms = [
    t for t in tree.get_terminals()
    if 'canine' not in f"{t.name} {tip_descriptions[str(t.name)] or ''}".lower()
]
# Sort by distance from the tree root, largest first.
human_distances = sorted(
    ((tree.distance(t), t.name) for t in human_terms),
    key=lambda x: x[0],
    reverse=True,
)

print("=== 🚨 독감 변종 TOP 5 ===")
print("-" * 70)
print(f"{'순위':<4} | {'ID':<12} | {'변이도':<8} | {'신상 정보'}")
print("-" * 70)
for i, (dist, name) in enumerate(human_distances[:5], 1):
    target_id = str(name)
    full_info = tip_descriptions.get(target_id)
    found_info = _strain_label(full_info) if full_info else "정보 없음"
    print(f"{i:<5} | {target_id:<12} | {dist:.4f} | {found_info}")
print("-" * 70)
print("※ 변이도가 높을수록 기존 면역 체계를 회피할 가능성이 큽니다.")
=== 🚨 독감 변종 TOP 5 ===
----------------------------------------------------------------------
순위 | ID | 변이도 | 신상 정보
----------------------------------------------------------------------
1 | PV570275.1 | 0.0891 | A/Shanghai/B623090301/2023(H3N2
2 | PQ483286.1 | 0.0630 | [PQ483286.1 Influenza A virus (A/Jinan/JN] 06/2019(H3N2))
3 | OP847955.1 | 0.0575 | A/Izmir/45/2019(H3N2
4 | OP848003.1 | 0.0493 | A/Izmir/7/2020(H3N2
5 | PV570299.1 | 0.0493 | A/Shanghai/B622090701/2022(H3N2
----------------------------------------------------------------------
※ 변이도가 높을수록 기존 면역 체계를 회피할 가능성이 큽니다.
뭐 이거 막 절대적인 척도 이런거 아니고 해마글루티닌 긁어서 봤더니 쟤가 변이도가 제일 높더라 이런겁니다. 올해 독감 백신은 제약사에서 알아서 음 이놈이군 해서 백신 만들거니까 이거 너무 맹신하지 마십쇼.
'Coding > Python' 카테고리의 다른 글
| 폐암 데이터를 분석해서 생존률을 비교해보자 (0) | 2026.01.07 |
|---|---|
| 베이즈 정리 (0) | 2026.01.06 |
| 한타바이러스의 시퀀스를 받아서 MSA를 해보자 (후편) (0) | 2026.01.05 |
| 한타바이러스의 시퀀스를 받아서 MSA를 해보자 (전편) (0) | 2026.01.02 |
| 클린바 가서 VCF 자동으로 받기 시스템 (0) | 2025.12.30 |