alegent
diff --git a/‎1A.py
Lines changed: 108 additions & 0 deletions b/‎1A.py
Lines changed: 108 additions & 0 deletions
diff --git a/‎1B.py
Lines changed: 78 additions & 0 deletions b/‎1B.py
Lines changed: 78 additions & 0 deletions
diff --git a/‎1C.py
Lines changed: 71 additions & 0 deletions b/‎1C.py
Lines changed: 71 additions & 0 deletions
diff --git a/‎1D.py
Lines changed: 102 additions & 0 deletions b/‎1D.py
Lines changed: 102 additions & 0 deletions
@@ -0,0 +1,108 @@
+##Frequent Words Problem
+#This is the first problem in a collection of "code challenges" to accompany Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
+#A k-mer is a string of length k. We define Count(Text, Pattern) as the number of times that a k-mer Pattern appears as a substring of Text. For example,
+#Count(ACAACTATGCATACTATCGGGAACTATCCT,ACTAT)=3.
+#We note that Count(CGATATATCCATAG, ATA) is equal to 3 (not 2) since we should account for overlapping occurrences of Pattern in Text.
+#We say that Pattern is a most frequent k-mer in Text if it maximizes Count(Text, Pattern) among all k-mers. For example, "ACTAT" is a most frequent 5-mer in "ACAACTATGCATCACTATCGGGAACTATCCT", and "ATA" is a most frequent 3-mer of "CGATATATCCATAG".
+
+#Frequent Words Problem
+#Find the most frequent k-mers in a string.
+#Given: A DNA string Text and an integer k.
+#Return: All most frequent k-mers in Text (in any order).
+
+#Sample Dataset
+'''
+ACGTTGCATGTCGCATGATGCATGAGAGCT
+4
+'''
+#Sample Output
+'''
+CATG GCAT
+'''
+##########################################################################################
+
+import os, time
+from Bio.Seq import Seq
+
+# start timing
+startTime = time.time()
+print 'Start'
+
+path = os.path.join('E:\\','gential','Documents','Archivio_Coursesera','Coursera_BioinformaticsAlgorithms(Part1)','Rosalind','Bioinformatics_TextbookTrack')
+In_filename = 'input.txt'	#'rosalind_1a_2_dataset.txt'
+fText_in = os.path.join(path,In_filename)
+In_filetext = open(fText_in,'r')
+lines=In_filetext.readlines()
+In_filetext.close()
+Out_filename = 'output.txt'	#'rosalind_1a_2_output.txt'
+fText_out = os.path.join(path,Out_filename)
+Out_filetext = open(fText_out,'w')
+
+values = []
+for line in lines:
+	values += line.split()
+DNA_seq = values[0]
+k = int(values[1])	#4#
+#DNA_seq = 'ACGTTGCATGTCGCATGATGCATGAGAGCT'
+print DNA_seq
+print k
+
+DNA_seq_list = list(DNA_seq)
+#print DNA_seq_list
+DNA_lenght = len(DNA_seq)
+#print DNA_lenght
+
+sequence_k_selection = ()
+sequence_k_list = []
+max_N_sequence = 0
+k_mers = k - 1
+iter = -1
+for N_1 in range(DNA_lenght):
+	iter += 1
+	k_mers += 1
+	sequence_k = (DNA_seq)[iter:k_mers]
+	#print N_1, iter, k_mers, sequence_k
+	if len(sequence_k) == k:
+		a = -1
+		b = k - 1
+		N_sequence = 0
+		for N_2 in range(DNA_lenght):
+			a += 1
+			b += 1
+			sequence = (DNA_seq)[a:b]
+			if sequence_k == sequence:
+				N_sequence += 1
+				#print N_sequence, sequence, ' = ', sequence_k
+		if N_sequence >= max_N_sequence:
+			max_N_sequence = N_sequence
+			#print 'max_N_sequence -->', max_N_sequence
+				
+		if N_sequence >= max_N_sequence:
+			#sequence_k_selection = (str(N_sequence)+' '+sequence_k)
+			#print sequence_k_selection
+			sequence_k_list.insert(0,str(N_sequence))
+			sequence_k_list.insert(1,sequence_k)
+print sequence_k_list
+
+seq_list = []
+max_value = int(sequence_k_list[0])
+print 'max_value is ',max_value
+n = 0
+for iter in range(len(sequence_k_list)/2):
+	N_sequence = int(sequence_k_list[n])
+	sequence_k = sequence_k_list[n+1]
+	n+=2
+	if N_sequence >= max_value:
+		max_value = N_sequence
+		if sequence_k not in seq_list:
+			seq_list += [sequence_k]
+print seq_list
+for seq in seq_list:
+	Out_filetext.write(str(seq)+'\n')
+Out_filetext.close()
+
+# show elapsed time
+endTime = time.time()
+print 'Entire Genome elapsed time: ', endTime - startTime, ' seconds'
+
+print 'end'
@@ -0,0 +1,78 @@
+##In DNA strings, symbols 'A' and 'T' are complements of each other, as are 'C' and 'G'. Given a nucleotide p, we denote its complementary nucleotide as p' (the textbook writes it with an overbar). The reverse complement of a string Pattern = p1...pn is the string Pattern' = pn' ... p1' formed by taking the complement of each nucleotide in Pattern, then reversing the resulting string.
+#For example, the reverse complement of Pattern = "GTCA" is Pattern' = "TGAC".
+
+#Reverse Complement Problem
+#Find the reverse complement of a DNA string.
+
+#Given: A DNA string Pattern.
+#Return: The reverse complement of Pattern.
+
+#Sample Dataset
+'''
+AAAACCCGGT
+'''
+#Sample Output
+'''
+ACCGGGTTTT
+'''
+##########################################################################################
+
+import os, time
+from Bio.Seq import Seq
+
+# start timing
+startTime = time.time()
+print 'Start'
+
+path = os.path.join('E:\\','gential','Documents','Archivio_Coursesera','Coursera_BioinformaticsAlgorithms(Part1)','Rosalind','Bioinformatics_TextbookTrack')
+In_filename = 'rosalind_1b_1_dataset.txt'
+fText_in = os.path.join(path,In_filename)
+In_filetext = open(fText_in,'r')
+lines=In_filetext.readlines()
+In_filetext.close()
+Out_filename = 'output.txt'	#rosalind_1b_1_output
+fText_out = os.path.join(path,Out_filename)
+Out_filetext = open(fText_out,'w')
+
+for line in lines:
+	DNA_seq = line
+#DNA_seq = 'AAAACCCGGT'
+print DNA_seq
+DNA_seq_list = list(DNA_seq)
+#print DNA_seq_list
+DNA_lenght = len(DNA_seq)
+'''
+# using the Bio.Seq module
+DNA_seq = Seq('AAAACCCGGT')
+DNA_revc = DNA_seq.reverse_complement()
+print DNA_revc
+'''
+DNA_rev=''
+DNA_revc=''
+Base=''
+i=DNA_lenght
+for N in range(DNA_lenght):
+	DNA_rev += DNA_seq_list[i-1]
+	i+=-1
+print DNA_rev
+
+for Nucleotide in DNA_rev:
+	if Nucleotide == 'A':
+		Base = 'T'
+	if Nucleotide == 'T':
+		Base = 'A'
+	if Nucleotide == 'C':
+		Base = 'G'
+	if Nucleotide == 'G':
+		Base = 'C'
+	DNA_revc += Base
+print DNA_revc
+
+Out_filetext.write(str(DNA_revc))
+Out_filetext.close()
+
+# show elapsed time
+endTime = time.time()
+print 'Entire Genome elapsed time: ', endTime - startTime, ' seconds'
+
+print 'end'
@@ -0,0 +1,71 @@
+##Pattern Matching Problem
+
+#Recall that different occurrences of a substring can overlap with each other. For example, ATA occurs three times in CGATATATCCATAG.
+#Pattern Matching Problem
+#Find all occurrences of a pattern in a string.
+
+#Given: Strings Pattern and Genome.
+#Return: All starting positions (0-based) in Genome where Pattern appears as a substring.
+
+#Sample Dataset
+'''
+ATAT
+GATATATGCATATACTT
+'''
+#Sample Output
+'''
+1 3 9
+'''
+##########################################################################################
+
+import os, time
+from Bio.Seq import Seq
+
+# start timing
+startTime = time.time()
+print 'Start'
+
+path = os.path.join('E:\\','gential','Documents','Archivio_Coursesera','Coursera_BioinformaticsAlgorithms(Part1)','Rosalind','Bioinformatics_TextbookTrack')
+In_filename = 'rosalind_1c_1_dataset.txt'
+fText_in = os.path.join(path,In_filename)
+In_filetext = open(fText_in,'r')
+lines=In_filetext.readlines()
+In_filetext.close()
+Out_filename = 'output.txt'						#'rosalind_1c_1_output.txt'
+fText_out = os.path.join(path,Out_filename)
+Out_filetext = open(fText_out,'w')
+
+values = []
+for line in lines:
+	values += line.split()
+Pattern = values[0]
+DNA_seq = values[1]
+#print Pattern
+#print DNA_seq
+#Pattern = 'ATAT'
+#DNA_seq = 'GATATATGCATATACTT'
+
+
+DNA_seq_list = list(DNA_seq)
+DNA_lenght = len(DNA_seq)
+
+k = len(Pattern)
+k_mers = k - 1
+iter = -1
+ciclo = -1
+
+for Nucleotide in DNA_seq_list:
+	ciclo = ciclo + 1
+	iter = iter + 1
+	k_mers = k_mers + 1
+	sequence = (DNA_seq)[iter:k_mers]
+	if sequence == Pattern:
+		print ciclo, ' '#, sequence, ' = ', Pattern
+		Out_filetext.write(str(ciclo)+'\n')
+Out_filetext.close()
+
+# show elapsed time
+endTime = time.time()
+print 'Entire DNA_seq elapsed time: ', endTime - startTime, ' seconds'
+
+print 'end'
@@ -0,0 +1,102 @@
+##Clump Finding Problem
+
+#Given integers L and t, a string Pattern forms an (L, t)-clump inside a (larger) string Genome if there is an interval of Genome of length L in which Pattern appears at least t times. For example, TGCA forms a (25,3)-clump in the following Genome: gatcagcataagggtcccTGCAaTGCAtgacaagccTGCAgttgttttac.
+#Clump Finding Problem
+#Find patterns forming clumps in a string.
+
+#Given: A string Genome, and integers k, L, and t.
+#Return: All distinct k-mers forming (L, t)-clumps in Genome.
+
+#Sample Dataset
+'''
+CGGACTCGACAGATGTGAAGAAATGTGAAGACTGAGTGAAGAGAAGAGGAAACACGACACGACATTGCGACATAATGTACGAATGTAATGTGCCTATGGC
+5 75 4
+'''
+#Sample Output
+'''
+CGACA GAAGA AATGT
+'''
+##########################################################################################
+
+
+import os, time
+from Bio.Seq import Seq
+
+# start timing
+startTime = time.time()
+print 'Start'
+
+path = os.path.join('E:\\','gential','Documents','Archivio_Coursesera','Coursera_BioinformaticsAlgorithms(Part1)','Rosalind','Bioinformatics_TextbookTrack')
+In_filename = 'rosalind_1d_2_dataset.txt'
+fText_in = os.path.join(path,In_filename)
+In_filetext = open(fText_in,'r')
+lines=In_filetext.readlines()
+In_filetext.close()
+Out_filename = 'output.txt'					#'rosalind_1d_2_output.txt'
+fText_out = os.path.join(path,Out_filename)
+Out_filetext = open(fText_out,'w')
+
+values = []
+for line in lines:
+	values += line.split()
+DNA_seq = values[0]
+k = int(values[1])	#5#
+L = int(values[2])	#75#
+t = int(values[3])	#4#
+#DNA_seq = 'CGGACTCGACAGATGTGAAGAAATGTGAAGACTGAGTGAAGAGAAGAGGAAACACGACACGACATTGCGACATAATGTACGAATGTAATGTGCCTATGGC'
+print DNA_seq
+print k
+print L
+print t
+
+DNA_seq_list = list(DNA_seq)
+DNA_lenght = len(DNA_seq)
+
+frequency = t
+ciclo = -1
+
+L_first = 0
+L_last = L
+sequence_list = []
+for OriC_Bases in range(DNA_lenght):
+	OriC = (DNA_seq)[L_first:L_last]
+	L_first = L_first + L
+	L_last = L_last + L
+	
+	OriC_Length = len(OriC)
+	if  OriC_Length <= L:
+		#print OriC
+		
+		iter = -1
+		k_mers = k - 1
+		for Nucleotide in OriC:
+			iter = iter + 1
+			k_mers = k_mers + 1
+			sequence_k = (OriC)[iter:k_mers]
+			
+			a = -1
+			b = k - 1
+			N_sequence = 0
+			for N_mers in DNA_seq:
+				a = a + 1
+				b = b + 1
+				sequence = (DNA_seq)[a:b]
+				
+				if sequence == sequence_k:
+					N_sequence = N_sequence + 1
+					if N_sequence >= frequency:
+						#print N_sequence, sequence, ' = ', sequence_k
+						if sequence not in sequence_list:
+							sequence_list += [sequence]
+							#print sequence_list
+
+for sequence in sequence_list: 
+	Out_filetext.write(sequence+'\n')
+Out_filetext.close()
+
+# show elapsed time
+endTime = time.time()
+print 'Entire DNA_seq elapsed time: ', endTime - startTime, ' seconds'
+print 'The DNA_seq is composed by a length of ',len(DNA_seq),' nucleotides'
+
+print 'end'