#!/usr/bin/env python
# Find ORFs in DNA.
# Put markup around them.
from fileinput import input
import string
import sys
import re
# Earlier attempts at the coding-region pattern, kept for reference:
#codingRegion = re.compile( r'(?<=TATA.{1,10})ATG(...)*(TAG|TGA|TTA)' )
#   -> sre_constants.error: look-behind requires fixed-width pattern
#codingRegion = re.compile( r'(?<=TATA...)ATG(...)*(TAG|TGA|TTA)' )
#   -> the greedy (...)* takes the longest match, so intermediate stop
#      codons end up inside the result.
#codingRegion = re.compile( r'(?<=TATA...)ATG(...)*?(TAG|TGA|TTA)' )
# An ORF: the start codon ATG, as few whole codons as possible (non-greedy),
# then the first in-frame stop codon.  The stop codons of the standard
# genetic code are TAG, TGA and TAA (TTA encodes leucine, it is not a stop).
_orfPattern = r'ATG(...)*?(TAG|TGA|TAA)'
_orf = re.compile( _orfPattern )
# A candidate coding region: a TATA box, 1 to 30 arbitrary characters, then
# an ORF.  Group 1 spans the ORF alone, from start codon through stop codon.
_codingRegion = re.compile( r'TATA.{1,30}(%s)' % _orfPattern )
def findOrfs( theSeq ):
    """Return (start, end) index pairs for every ORF found in theSeq.

    theSeq -- the sequence string to scan.

    start is the index of the ORF's start codon and end is the index just
    before its stop codon, so theSeq[start:end] is the ORF without the
    stop codon.  Matching uses the module-level _orf pattern; no TATA box
    is required here (findCodingRegions applies that restriction).

    The search resumes three characters past the start of the previous
    match, i.e. just after its start codon, so ORFs that begin inside an
    earlier ORF are reported as well and spans may overlap.
    """
    spans = []
    pos = 0
    while True:
        m = _orf.search( theSeq, pos )
        if m is None:
            break
        # Drop the trailing three characters: the stop codon.
        spans.append( ( m.start(), m.end() - 3 ) )
        pos = m.start() + 3
    return spans
def findCodingRegions( theSeq, startAt=3000 ):
    """Return the (start, end) spans of ORFs that follow a TATA box.

    theSeq  -- the sequence string to scan.
    startAt -- index at which scanning begins.  It defaults to 3000, the
               offset that used to be hard-coded here, so a sequence no
               longer than that yields an empty list unless a smaller
               value is passed.

    Each span is group 1 of a _codingRegion match: the ORF itself, from
    its start codon through its stop codon, without the TATA box or the
    spacer that precede it.

    The search resumes three characters past the start of the previous
    match (not past its end), so returned spans may overlap and the same
    ORF can be reported more than once when several TATA boxes precede it.
    """
    spans = []
    m = _codingRegion.search( theSeq, startAt )
    while m:
        spans.append( m.span( 1 ) )
        # Step just far enough that the same TATA box cannot match again.
        m = _codingRegion.search( theSeq, m.start() + 3 )
    return spans
if __name__ == '__main__':
    # Read sequences line by line (files named on the command line, or
    # stdin) and print each line split around its candidate coding regions.
    # Every print() takes exactly one argument, so the output is identical
    # under Python 2 while the script also parses under Python 3.
    for line in input():
        cursor = 0  # index of the first character not yet printed
        for orfStart, orfEnd in findCodingRegions( line ):
            # Text between the previous region and this one.
            print( line[ cursor: orfStart ] )
            # The region as three pieces: start codon, body, stop codon.
            # NOTE(review): the file header mentions markup, but this
            # format string emits the pieces with nothing between them --
            # confirm whether tags were meant to be here.
            print( '%s%s%s' % (
                line[ orfStart: orfStart + 3 ],
                line[ orfStart + 3: orfEnd - 3 ],
                line[ orfEnd - 3: orfEnd ] ) )
            cursor = orfEnd
        # Whatever follows the last region (the whole line if none matched).
        print( line[ cursor: ] )