GNBF5010 Homework 3¶
Student id: 1155228903
Question1¶
(Calculate the proportion of SNPs that are transitions, as well as the number of SNPs in each region)
In [6]:
import pandas as pd
# Class for SNP
class SNP:
def __init__(self, chrname, pos, snpid, ref_allele, alt_allele):
self.chrname = chrname
self.pos = pos
self.snpid = snpid
self.ref_allele = ref_allele
self.alt_allele = alt_allele
# Check if SNP is Transition
def is_transition(self):
ref_alt = (self.ref_allele + self.alt_allele).upper()
return ref_alt in ["AG", "GA", "CT", "TC"]
# Check if SNP is Transversion
def is_transversion(self):
return not self.is_transition()
# Class for Chromosome
class Chromosome:
def __init__(self, chrname):
self.chrname = chrname
self.locations_to_snps = dict()
# Add SNP to Chromosome
def add_snp(self, snp):
self.locations_to_snps[snp.pos] = snp
# Count Transitions
def count_transitions(self):
count = 0
for snp in self.locations_to_snps.values():
if snp.is_transition():
count += 1
return count
# Count Transversions
def count_transversions(self):
return len(self.locations_to_snps) - self.count_transitions()
# Count SNPs in Region (total and transitions)
def snps_in_region(self, region_start, region_end):
total = 0
transitions = 0
for location in self.locations_to_snps:
if region_start <= location <= region_end:
total += 1
if self.locations_to_snps[location].is_transition():
transitions += 1
return total, transitions
def main():
# Read in VCF File
chrnames_to_chrs = dict()
file_name = "trio.sample.vcf" # Change this to the name of the VCF file
with open(file_name, "r") as fh:
for line in fh:
if not line.startswith("#"):
chrname, pos, snpid, ref, alt = line.strip().split("\t")[:5]
pos = int(pos)
if chrname not in chrnames_to_chrs:
chrnames_to_chrs[chrname] = Chromosome(chrname)
if ref != alt:
newsnp = SNP(chrname, pos, snpid, ref, alt)
chrnames_to_chrs[chrname].add_snp(newsnp)
# Save Results to File
with open("transitions.txt", "w") as out_file:
out_file.write("chromosome\tregion\tpercent_transitions\tnum_snps\n")
region_size = 1000000
for chrname in chrnames_to_chrs:
chr_obj = chrnames_to_chrs[chrname]
last_snp_position = max(chr_obj.locations_to_snps.keys())
region_start = 1
while region_start <= last_snp_position:
region_end = region_start + region_size - 1
num_snps, num_transitions = chr_obj.snps_in_region(region_start, region_end)
if num_snps > 0:
percent_transitions = num_transitions / num_snps
else:
percent_transitions = 0
out_file.write(f"{chrname}\t{region_start}..{region_end}\t{percent_transitions:.3f}\t{num_snps}\n")
region_start += region_size
if __name__ == "__main__":
main()
# Display Results
df = pd.read_csv("transitions.txt", sep="\t")
display(df)
chromosome | region | percent_transitions | num_snps | |
---|---|---|---|---|
0 | 1 | 1..1000000 | 0.722 | 18 |
1 | 1 | 1000001..2000000 | 0.794 | 34 |
2 | 1 | 2000001..3000000 | 0.681 | 72 |
3 | 1 | 3000001..4000000 | 0.662 | 65 |
4 | 1 | 4000001..5000000 | 0.747 | 83 |
... | ... | ... | ... | ... |
3028 | X | 150000001..151000000 | 0.771 | 35 |
3029 | X | 151000001..152000000 | 0.737 | 38 |
3030 | X | 152000001..153000000 | 0.676 | 37 |
3031 | X | 153000001..154000000 | 0.750 | 4 |
3032 | X | 154000001..155000000 | 0.533 | 15 |
3033 rows × 4 columns
Question2¶
(Simulate the evolutionary process)
In [7]:
import random
# Class for Bug
class Bug:
def __init__(self):
self.genome = ''.join(random.choice("ACGT") for _ in range(100)) # Randomly generate genome
self.fitness = self.get_fitness()
# Calculate fitness of Bug
def get_fitness(self):
num_Gs = self.genome.count('G')
num_Cs = self.genome.count('C')
if 'AAAAA' in self.genome:
return num_Gs + num_Cs + 5
else:
return num_Gs + num_Cs
# Mutate a random base in the genome
def mutate_random_base(self):
index = random.randint(0, 99)
new_base = random.choice("ACGT")
self.genome = self.genome[:index] + new_base + self.genome[index+1:]
self.fitness = self.get_fitness()
# Set a base at a specific index in the genome
def set_base(self, index, base):
if 0 <= index < 100 and base in "ACGT":
self.genome = self.genome[:index] + base + self.genome[index+1:]
self.fitness = self.get_fitness()
# Compare Bugs based on fitness (facilitate the sorting of the bug list)
def __lt__(self, other):
return self.fitness < other.fitness
# Class for Population
class Population:
def __init__(self):
self.bug_list = [Bug() for _ in range(50)] # Create a population of 50 bugs
# Create offspring by mutating random base in genome
def create_offspring(self):
new_pop = []
for bug in self.bug_list:
newbug = Bug()
newbug.genome = bug.genome
newbug.mutate_random_base()
new_pop.append(bug)
new_pop.append(newbug)
self.bug_list = new_pop
# Reduces self.bug_list to the top 50% of bug objects by fitness
def cull(self):
self.bug_list.sort()
self.bug_list = self.bug_list[len(self.bug_list)//2:]
# Get mean fitness of population
def get_mean_fitness(self):
return sum(bug.fitness for bug in self.bug_list) / len(self.bug_list)
def main():
p = Population()
print("Fitness of population during evolutionary progress")
print("(Assuming fitness = num_Gs + num_Cs (+ 5, if AAAAA present in genome)):")
# Run for 20 generations
for _ in range(20):
p.create_offspring()
p.cull()
print(p.get_mean_fitness())
if __name__ == "__main__":
main()
Fitness of population during evolutionary progress (Assuming fitness = num_Gs + num_Cs (+ 5, if AAAAA present in genome)): 53.8 56.32 57.48 58.48 59.34 60.18 60.58 61.18 61.64 62.26 62.56 63.1 63.42 63.8 64.26 64.66 65.24 65.56 65.98 66.3
Question3¶
(Regular Expressions)
- Match a word that contains 4 or more consecutive vowels (a, e, i, o, u).
- Regular Expression for pattern 1:
\b\w*([aeiou]{4,})\w*\b
- The code for testing is below:
- Regular Expression for pattern 1:
In [8]:
import re
pattern1 = r'\b\w*([aeiou]{4,})\w*\b'
test_strings1 = ["hello", "aeie", "daoioaidas", "aebee", "auu"]
for test in test_strings1:
if re.search(pattern1, test):
print(f"> '{test}' matches pattern 1")
else:
print(f"> '{test}' does not match pattern 1")
> 'hello' does not match pattern 1 > 'aeie' matches pattern 1 > 'daoioaidas' matches pattern 1 > 'aebee' does not match pattern 1 > 'auu' does not match pattern 1
- Match a string with at least 12 characters with “z” as the last one.
- Regular Expression for pattern 2:
.{11}z$
- The code for testing is below:
- Regular Expression for pattern 2:
In [9]:
import re
pattern2 = r'.{11}z$'
test_strings2 = ["hello", "thisisaz", "thisstringhasmorethan11charactersandendswithz", "sjkfbajkhnfjuvaehjfbjkasbhjfvhjkawbgfjksbkjcbxsjkgfhkjagbjkz"]
for test in test_strings2:
if re.search(pattern2, test):
print(f"> '{test}' matches pattern 2")
else:
print(f"> '{test}' does not match pattern 2")
> 'hello' does not match pattern 2 > 'thisisaz' does not match pattern 2 > 'thisstringhasmorethan11charactersandendswithz' matches pattern 2 > 'sjkfbajkhnfjuvaehjfbjkasbhjfvhjkawbgfjksbkjcbxsjkgfhkjagbjkz' matches pattern 2
- Match a number that is only composed of even digits, including 0. But don't allow 0 to be the first digit.
- Regular Expression for pattern 3:
^(?!0)[02468][02468]*$
- The code for testing is below:
- Regular Expression for pattern 3:
In [10]:
import re
pattern3 = r'^(?!0)[02468][02468]*$'
test_strings3 = ["248", "4200", "6", "0", "020", "5", "123"]
for test in test_strings3:
if re.search(pattern3, test):
print(f"> '{test}' matches pattern 3")
else:
print(f"> '{test}' does not match pattern 3")
> '248' matches pattern 3 > '4200' matches pattern 3 > '6' matches pattern 3 > '0' does not match pattern 3 > '020' does not match pattern 3 > '5' does not match pattern 3 > '123' does not match pattern 3
- Match an RNA sequence that begins with "AUG" and ends with either of "UAA","UAG", or "UGA".
- Regular Expression for pattern 4:
AUG[ACGU]*(?:UAA|UAG|UGA)$
- The code for testing is below:
- Regular Expression for pattern 4:
In [11]:
import re
pattern4 = r'AUG[ACGU]*(?:UAA|UAG|UGA)$'
test_strings4 = ["AUGUUUUUAA", "AUGUUUUUGA", "AUGUUUUUAG", "AUGUUUUUUU", "AUGUUTUUAA"]
for test in test_strings4:
if re.search(pattern4, test):
print(f"> '{test}' matches pattern 4")
else:
print(f"> '{test}' does not match pattern 4")
> 'AUGUUUUUAA' matches pattern 4 > 'AUGUUUUUGA' matches pattern 4 > 'AUGUUUUUAG' matches pattern 4 > 'AUGUUUUUUU' does not match pattern 4 > 'AUGUUTUUAA' does not match pattern 4