Skip to content

Commit b61f3c5

Browse files
authored
Add files via upload
1 parent 3aa48a9 commit b61f3c5

File tree

11 files changed

+1308
-0
lines changed

11 files changed

+1308
-0
lines changed

1A.py

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
##Frequent Words Problem
2+
#This is the first problem in a collection of "code challenges" to accompany Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
3+
#A k-mer is a string of length k. We define Count(Text, Pattern) as the number of times that a k-mer Pattern appears as a substring of Text. For example,
4+
#Count(ACAACTATGCATACTATCGGGAACTATCCT,ACTAT)=3.
5+
#We note that Count(CGATATATCCATAG, ATA) is equal to 3 (not 2) since we should account for overlapping occurrences of Pattern in Text.
6+
#We say that Pattern is a most frequent k-mer in Text if it maximizes Count(Text, Pattern) among all k-mers. For example, "ACTAT" is a most frequent 5-mer in "ACAACTATGCATCACTATCGGGAACTATCCT", and "ATA" is a most frequent 3-mer of "CGATATATCCATAG".
7+
8+
#Frequent Words Problem
9+
#Find the most frequent k-mers in a string.
10+
#Given: A DNA string Text and an integer k.
11+
#Return: All most frequent k-mers in Text (in any order).
12+
13+
#Sample Dataset
14+
'''
15+
ACGTTGCATGTCGCATGATGCATGAGAGCT
16+
4
17+
'''
18+
#Sample Output
19+
'''
20+
CATG GCAT
21+
'''
22+
##########################################################################################
23+
24+
import os, time
25+
from Bio.Seq import Seq
26+
27+
# Frequent Words Problem solver.
#
# Reads a DNA string and an integer k (whitespace separated) from the input
# file and writes every most frequent k-mer to the output file, one per line.
# The code is valid on both Python 2 and Python 3.
from collections import Counter

path = os.path.join('E:\\','gential','Documents','Archivio_Coursesera','Coursera_BioinformaticsAlgorithms(Part1)','Rosalind','Bioinformatics_TextbookTrack')
In_filename = 'input.txt' #'rosalind_1a_2_dataset.txt'
Out_filename = 'output.txt' #'rosalind_1a_2_output.txt'
fText_in = os.path.join(path, In_filename)
fText_out = os.path.join(path, Out_filename)


def _echo(*parts):
    """Print *parts* separated by single spaces.

    The script originally used the Python 2 ``print a, b`` statement; this
    helper keeps the same space-separated layout with syntax that is also
    valid on Python 3.
    """
    print(' '.join(str(part) for part in parts))


def _kmer_counts(text, k):
    """Return a Counter mapping each k-mer of *text* to its number of
    (possibly overlapping) occurrences."""
    return Counter(text[start:start + k] for start in range(len(text) - k + 1))


def most_frequent_kmers(text, k):
    """Return all most frequent k-mers of *text*.

    Overlapping occurrences are counted. The result lists each winning k-mer
    once, ordered by descending position of its last occurrence (the order
    the original script produced).

    Parameters:
        text: the DNA string to scan.
        k: length of the k-mers; must be positive.

    Returns:
        A list of k-mers; empty when *text* is shorter than *k*.

    Raises:
        ValueError: if *k* is not positive.
    """
    if k <= 0:
        raise ValueError('k must be a positive integer, got %r' % (k,))

    counts = _kmer_counts(text, k)
    if not counts:
        # text is shorter than k: there is no k-mer at all.
        return []

    best = max(counts.values())
    winners = []
    seen = set()
    # Walk the start positions right-to-left so each winner is emitted at
    # its last occurrence.
    for start in range(len(text) - k, -1, -1):
        kmer = text[start:start + k]
        if counts[kmer] == best and kmer not in seen:
            seen.add(kmer)
            winners.append(kmer)
    return winners


def main(in_path=None, out_path=None):
    """Solve the problem for the dataset at *in_path*, writing to *out_path*.

    Both paths default to the module-level ``fText_in`` / ``fText_out``.

    Raises:
        ValueError: if the input does not contain a DNA string followed by k.
    """
    startTime = time.time()
    _echo('Start')

    with open(in_path or fText_in, 'r') as In_filetext:
        values = In_filetext.read().split()
    if len(values) < 2:
        raise ValueError('input must contain a DNA string followed by k')
    DNA_seq = values[0]
    k = int(values[1])
    _echo(DNA_seq)
    _echo(k)

    seq_list = most_frequent_kmers(DNA_seq, k)
    if seq_list:
        _echo('max_value is ', _kmer_counts(DNA_seq, k)[seq_list[0]])
    _echo(seq_list)

    # The output file is opened only once the result exists, so a failure
    # above does not leave a truncated file behind.
    with open(out_path or fText_out, 'w') as Out_filetext:
        Out_filetext.writelines(str(seq) + '\n' for seq in seq_list)

    endTime = time.time()
    _echo('Entire Genome elapsed time: ', endTime - startTime, ' seconds')
    _echo('end')


if __name__ == '__main__':
    main()

1B.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
##In DNA strings, symbols 'A' and 'T' are complements of each other, as are 'C' and 'G'. Given a nucleotide p, we denote its complementary nucleotide as p'. The reverse complement of a string Pattern = p1...pn is the string Pattern' = pn' ... p1' formed by taking the complement of each nucleotide in Pattern, then reversing the resulting string.
2+
#For example, the reverse complement of Pattern = "GTCA" is "TGAC".
3+
4+
#Reverse Complement Problem
5+
#Find the reverse complement of a DNA string.
6+
7+
#Given: A DNA string Pattern.
8+
#Return: The reverse complement of Pattern.
9+
10+
#Sample Dataset
11+
'''
12+
AAAACCCGGT
13+
'''
14+
#Sample Output
15+
'''
16+
ACCGGGTTTT
17+
'''
18+
##########################################################################################
19+
20+
import os, time
21+
from Bio.Seq import Seq
22+
23+
# Reverse Complement Problem solver.
#
# Reads a DNA string from the input file and writes its reverse complement to
# the output file. The code is valid on both Python 2 and Python 3.

path = os.path.join('E:\\','gential','Documents','Archivio_Coursesera','Coursera_BioinformaticsAlgorithms(Part1)','Rosalind','Bioinformatics_TextbookTrack')
In_filename = 'rosalind_1b_1_dataset.txt'
Out_filename = 'output.txt' #rosalind_1b_1_output
fText_in = os.path.join(path, In_filename)
fText_out = os.path.join(path, Out_filename)

# Complementary base for each DNA nucleotide.
_COMPLEMENT = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}


def _echo(*parts):
    """Print *parts* separated by single spaces.

    The script originally used the Python 2 ``print a, b`` statement; this
    helper keeps the same space-separated layout with syntax that is also
    valid on Python 3.
    """
    print(' '.join(str(part) for part in parts))


def reverse_complement(pattern):
    """Return the reverse complement of the DNA string *pattern*.

    Parameters:
        pattern: a string made only of the symbols A, C, G and T.

    Returns:
        The complemented string read back-to-front ('' for empty input).

    Raises:
        ValueError: if *pattern* contains any other symbol. Such symbols are
            rejected so they cannot silently produce a wrong base.
    """
    try:
        return ''.join(_COMPLEMENT[base] for base in reversed(pattern))
    except KeyError as err:
        raise ValueError('not a DNA nucleotide: %r' % (err.args[0],))


def main(in_path=None, out_path=None):
    """Solve the problem for the dataset at *in_path*, writing to *out_path*.

    Both paths default to the module-level ``fText_in`` / ``fText_out``.
    All whitespace (line breaks included) is removed from the input, so a
    trailing blank line or a sequence wrapped over several lines is handled.
    """
    startTime = time.time()
    _echo('Start')

    with open(in_path or fText_in, 'r') as In_filetext:
        DNA_seq = ''.join(In_filetext.read().split())
    _echo(DNA_seq)

    DNA_rev = DNA_seq[::-1]
    _echo(DNA_rev)

    DNA_revc = reverse_complement(DNA_seq)
    _echo(DNA_revc)

    # The output file is opened only once the result exists, so a failure
    # above does not leave a truncated file behind.
    with open(out_path or fText_out, 'w') as Out_filetext:
        Out_filetext.write(str(DNA_revc))

    endTime = time.time()
    _echo('Entire Genome elapsed time: ', endTime - startTime, ' seconds')
    _echo('end')


if __name__ == '__main__':
    main()

1C.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
##Pattern Matching Problem
2+
3+
#Recall that different occurrences of a substring can overlap with each other. For example, ATA occurs three times in CGATATATCCATAG.
4+
#Pattern Matching Problem
5+
#Find all occurrences of a pattern in a string.
6+
7+
#Given: Strings Pattern and Genome.
8+
#Return: All starting positions in Genome where Pattern appears as a substring.
9+
10+
#Sample Dataset
11+
'''
12+
ATAT
13+
GATATATGCATATACTT
14+
'''
15+
#Sample Output
16+
'''
17+
1 3 9
18+
'''
19+
##########################################################################################
20+
21+
import os, time
22+
from Bio.Seq import Seq
23+
24+
# Pattern Matching Problem solver.
#
# Reads Pattern and Genome (whitespace separated) from the input file and
# writes every 0-based starting position of Pattern in Genome to the output
# file, one per line. The code is valid on both Python 2 and Python 3.

path = os.path.join('E:\\','gential','Documents','Archivio_Coursesera','Coursera_BioinformaticsAlgorithms(Part1)','Rosalind','Bioinformatics_TextbookTrack')
In_filename = 'rosalind_1c_1_dataset.txt'
Out_filename = 'output.txt' #'rosalind_1c_1_output.txt'
fText_in = os.path.join(path, In_filename)
fText_out = os.path.join(path, Out_filename)


def _echo(*parts):
    """Print *parts* separated by single spaces.

    The script originally used the Python 2 ``print a, b`` statement; this
    helper keeps the same space-separated layout with syntax that is also
    valid on Python 3.
    """
    print(' '.join(str(part) for part in parts))


def find_occurrences(pattern, genome):
    """Return the 0-based start of every occurrence of *pattern* in *genome*.

    Overlapping occurrences are all reported, in increasing order.

    Parameters:
        pattern: the non-empty string to look for.
        genome: the string to search.

    Returns:
        A list of integer positions; empty when there is no match.

    Raises:
        ValueError: if *pattern* is empty.
    """
    if not pattern:
        raise ValueError('pattern must not be empty')

    positions = []
    found = genome.find(pattern)
    while found != -1:
        positions.append(found)
        # Resume one character later (not len(pattern) later) so that
        # overlapping matches are not skipped.
        found = genome.find(pattern, found + 1)
    return positions


def main(in_path=None, out_path=None):
    """Solve the problem for the dataset at *in_path*, writing to *out_path*.

    Both paths default to the module-level ``fText_in`` / ``fText_out``.

    Raises:
        ValueError: if the input does not contain Pattern followed by Genome.
    """
    startTime = time.time()
    _echo('Start')

    with open(in_path or fText_in, 'r') as In_filetext:
        values = In_filetext.read().split()
    if len(values) < 2:
        raise ValueError('input must contain Pattern followed by Genome')
    Pattern = values[0]
    DNA_seq = values[1]

    positions = find_occurrences(Pattern, DNA_seq)
    for ciclo in positions:
        _echo(ciclo, ' ')

    # The output file is opened only once the result exists, so a failure
    # above does not leave a truncated file behind.
    with open(out_path or fText_out, 'w') as Out_filetext:
        Out_filetext.writelines(str(ciclo) + '\n' for ciclo in positions)

    endTime = time.time()
    _echo('Entire DNA_seq elapsed time: ', endTime - startTime, ' seconds')
    _echo('end')


if __name__ == '__main__':
    main()

1D.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
##Clump Finding Problem
2+
3+
#Given integers L and t, a string Pattern forms an (L, t)-clump inside a (larger) string Genome if there is an interval of Genome of length L in which Pattern appears at least t times. For example, TGCA forms a (25,3)-clump in the following Genome: gatcagcataagggtcccTGCAaTGCAtgacaagccTGCAgttgttttac.
4+
#Clump Finding Problem
5+
#Find patterns forming clumps in a string.
6+
7+
#Given: A string Genome, and integers k, L, and t.
8+
#Return: All distinct k-mers forming (L, t)-clumps in Genome.
9+
10+
#Sample Dataset
11+
'''
12+
CGGACTCGACAGATGTGAAGAAATGTGAAGACTGAGTGAAGAGAAGAGGAAACACGACACGACATTGCGACATAATGTACGAATGTAATGTGCCTATGGC
13+
5 75 4
14+
'''
15+
#Sample Output
16+
'''
17+
CGACA GAAGA AATGT
18+
'''
19+
##########################################################################################
20+
21+
22+
import os, time
23+
from Bio.Seq import Seq
24+
25+
# Clump Finding Problem solver.
#
# Reads Genome, k, L and t (whitespace separated) from the input file and
# writes every distinct k-mer forming an (L, t)-clump to the output file, one
# per line. The code is valid on both Python 2 and Python 3.

path = os.path.join('E:\\','gential','Documents','Archivio_Coursesera','Coursera_BioinformaticsAlgorithms(Part1)','Rosalind','Bioinformatics_TextbookTrack')
In_filename = 'rosalind_1d_2_dataset.txt'
Out_filename = 'output.txt' #'rosalind_1d_2_output.txt'
fText_in = os.path.join(path, In_filename)
fText_out = os.path.join(path, Out_filename)


def _echo(*parts):
    """Print *parts* separated by single spaces.

    The script originally used the Python 2 ``print a, b`` statement; this
    helper keeps the same space-separated layout with syntax that is also
    valid on Python 3.
    """
    print(' '.join(str(part) for part in parts))


def find_clumps(genome, k, L, t):
    """Return the distinct k-mers that form an (L, t)-clump in *genome*.

    A k-mer forms an (L, t)-clump when some interval of *genome* of length
    *L* contains at least *t* (possibly overlapping) occurrences of it.

    Parameters:
        genome: the string to scan.
        k: k-mer length; must be positive.
        L: interval length; must be positive.
        t: minimum number of occurrences inside one interval; must be positive.

    Returns:
        A list of k-mers ordered by first occurrence in *genome*; empty when
        no k-mer qualifies.

    Raises:
        ValueError: if *k*, *L* or *t* is not positive.
    """
    if k <= 0 or L <= 0 or t <= 0:
        raise ValueError('k, L and t must be positive integers')

    # Record the start positions of every k-mer in one pass. `order` keeps
    # first-occurrence order explicitly because plain dicts are unordered on
    # Python 2.
    starts_by_kmer = {}
    order = []
    for start in range(len(genome) - k + 1):
        kmer = genome[start:start + k]
        if kmer not in starts_by_kmer:
            starts_by_kmer[kmer] = []
            order.append(kmer)
        starts_by_kmer[kmer].append(start)

    clumps = []
    for kmer in order:
        starts = starts_by_kmer[kmer]
        # Pair each occurrence with the one t-1 places later: those t
        # occurrences share an interval of length L exactly when the later
        # one ends no more than L characters after the earlier one begins.
        for first, last in zip(starts, starts[t - 1:]):
            if last + k - first <= L:
                clumps.append(kmer)
                break
    return clumps


def main(in_path=None, out_path=None):
    """Solve the problem for the dataset at *in_path*, writing to *out_path*.

    Both paths default to the module-level ``fText_in`` / ``fText_out``.

    Raises:
        ValueError: if the input does not contain Genome, k, L and t.
    """
    startTime = time.time()
    _echo('Start')

    with open(in_path or fText_in, 'r') as In_filetext:
        values = In_filetext.read().split()
    if len(values) < 4:
        raise ValueError('input must contain Genome followed by k, L and t')
    DNA_seq = values[0]
    k = int(values[1])
    L = int(values[2])
    t = int(values[3])
    _echo(DNA_seq)
    _echo(k)
    _echo(L)
    _echo(t)

    sequence_list = find_clumps(DNA_seq, k, L, t)

    # The output file is opened only once the result exists, so a failure
    # above does not leave a truncated file behind.
    with open(out_path or fText_out, 'w') as Out_filetext:
        Out_filetext.writelines(sequence + '\n' for sequence in sequence_list)

    endTime = time.time()
    _echo('Entire DNA_seq elapsed time: ', endTime - startTime, ' seconds')
    _echo('The DNA_seq is composed by a length of ', len(DNA_seq), ' nucleotides')
    _echo('end')


if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)