#!/usr/bin/env python
# Find ORFs in DNA.
# Put markup around them.
from fileinput import input
import string
import sys
import re
_orfPattern = r'ATG(...)*?(TAG|TGA|TTA)'
_orf = re.compile( _orfPattern )
def findOrfs( theSeq ):
"""Return a list of the ORFs.
>>> findOrfs( "" )
[]
>>> findOrfs( "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" )
[]
>>> findOrfs( "AAATGAATAATAATGATTGAGGGG" )
[(2, 17)]
>>> findOrfs( "AAATGAATAATAATGATTGAGGGATGTAG" )
[(2, 17), (23, 26)]
"""
answer = []
m = _orf.search( theSeq )
while ( m ):
answer.append( ( m.start(), m.end()-3 ) )
m = _orf.search( theSeq, m.start()+3 )
return answer
def markOrfs( theSeq ):
"""Mark up sequence data with ORF data.
>>> markOrfs( "AAATGAATAATAATGATTGAGGG" )
'AAATGAATAATAATGATTGAGGG'
"""
a = findOrfs( theSeq )
answer = ""
marker = 0;
for p in a:
if p[0] < marker: continue
answer += theSeq[ marker: p[0] ]
marker = p[0] + 3
answer += '%s%s%s' \
% ( theSeq[ p[0]: marker ], theSeq[ marker: p[1] ], theSeq[ p[1]: p[1]+3 ] )
marker = p[1] + 3
answer += theSeq[ marker: ]
return answer
#_codingRegion = re.compile( r'(?<=TATA.{1,10})ATG(...)*(TAG|TGA|TTA)' )
# sre_constants.error: look-behind requires fixed-width pattern
_codingRegion = re.compile( r'TATA.{1,30}(%s)' % _orfPattern )
def findCodingRegions( theSeq ):
"""Return a list of the ORFs that might be coding regions.
>>> src = '''
... GACTATATCTTATAAACCAATGGTACCAAAACTGGAGCGTCGATGTCCGGAAATTGATAT
... TCTGTTCCTGACCGGGAGCTCCGCCGGAAGATTTTCACCGTATACTCGGGCACCAGGAGA
... TCTGCATGGAAGAGCGATGTGTCGACCGTTCCGTGGCATAACTCCGCCTAGCGGCAAAGC
... GAAGTTGTTTGACAGGTGATGAAAAAAACAAAAAATGTCGAACGAGTAGCGTGCATAATG
... AGTGCGTCAATGCTGTATTTCTGATCGTGGCGTGAATACGCGTGGTACGCCGCAAAAGCC
... TTATGTGCAGGAGCACTGGCGGCATCTTCTCTGTCATTTTGCCTTCAAAACG
... '''
>>> src = src.replace('\\n', '')
>>> findCodingRegions( src )
[(19, 199), (42, 57), (125, 275)]
"""
answer = []
mStart = 0
m = _codingRegion.search( theSeq, mStart )
while ( m ):
answer.append( m.span( 1 ) )
mStart = m.start() + 3
m = _codingRegion.search( theSeq, mStart )
return answer
def markCodingRegions( theSeq ):
"""Mark up sequence data with coding region data.
123456789 123456789 123456789 123456789 123456789 123456789
>>> src = '''
... GACTATATCTTATAAACCAATGGTACCAAAACTGGAGCGTCGATGTCCGGAAATTGATAT
... TCTGTTCCTGACCGGGAGCTCCGCCGGAAGATTTTCACCGTATACTCGGGCACCAGGAGA
... TCTGCATGGAAGAGCGATGTGTCGACCGTTCCGTGGCATAACTCCGCCTAGCGGCAAAGC
... GAAGTTGTTTGACAGGTGATGAAAAAAACAAAAAATGTCGAACGAGTAGCGTGCATAATG
... AGTGCGTCAATGCTGTATTTCTGATCGTGGCGTGAATACGCGTGGTACGCCGCAAAAGCC
... TTATGTGCAGGAGCACTGGCGGCATCTTCTCTGTCATTTTGCCTTCAAAACG
... '''
>>> src = src.replace('\\n', '')
>>> html = markCodingRegions( src )
>>> html[:7]
'GACTATA'
>>> html[-9:]
'TTCAAAACG'
>>> len( html )
463
>>> html[18:96]
'AATGG'
>>> html[266:313]
'AGGTGATGA'
>>> html[314:].index( '<' )
Traceback (most recent call last):
...
ValueError: substring not found
"""
a = findCodingRegions( theSeq )
answer = ""
marker = 0;
for p in a:
if p[0] < marker: continue
answer += theSeq[ marker: p[0] ]
marker = p[0] + 3
answer += '%s%s%s' \
% ( theSeq[ p[0]: marker ], theSeq[ marker: p[1]-3 ], theSeq[ p[1]-3: p[1] ] )
marker = p[1]
answer += theSeq[ marker: ]
return answer
def _markEm( theInput ):
for line in theInput:
print markCodingRegions( line )
def _testdoc():
import doctest
doctest.testmod()
if __name__ == '__main__':
if 1 < len( sys.argv ):
if "doctest" == sys.argv[1]:
_testdoc()
else:
try:
f = open( sys.argv[1] )
except IOError, e:
print e
else:
_markEm( f )
f.close()
else:
_markEm( input() )