"""Jon Beck -- Two routines to use to read a fasta file."""


def parseHeaderLine(line):
    """Split out the label from a fasta header line.

    Parameter:
        line: a string starting with ">" and ending without a newline.

    Return:
        The first item in the string, after the ">", up to the first space.
        The first character is dropped unconditionally, whatever it is.
    """
    return line[1:].split(' ')[0]


def readfasta(filename):
    """Read a fasta file into a list of records.

    Parameter:
        filename: path of a file that must be in fasta format.  The file
            is assumed to have a header line as its first line.  Blank
            lines (including lines holding only whitespace) are ignored
            wherever they appear after the first line.

    Return:
        A list of lists.  Each inner list has three elements:
          1. the sequence identifier: the characters between the leading
             ">" and the first space of the header line
          2. the entire header line as read, with trailing whitespace
             removed (the leading ">" is retained)
          3. the sequence: a single string of all the letters with no
             line terminators
        An empty file yields an empty list.
    """
    resultList = []

    # The context manager closes the file even if reading raises.
    with open(filename, 'r') as infile:
        # Process the first line, which must be a header line.
        firstLine = infile.readline()
        if firstLine == '':
            # readline() returns '' only at end of file: nothing to parse.
            return resultList

        headerLine = firstLine.rstrip()
        label = parseHeaderLine(headerLine)

        # Collect the pieces of the current sequence and join them once,
        # rather than growing a string with repeated concatenation.
        pieces = []

        for line in infile:
            line = line.rstrip()

            # Ignore blank lines.
            if line == '':
                continue

            if line.startswith('>'):
                # A new header: finish the previous record and start a new
                # one.  The header must be refreshed together with the
                # label, otherwise every record would carry the header of
                # the first one.
                resultList.append([label, headerLine, ''.join(pieces)])
                headerLine = line
                label = parseHeaderLine(line)
                pieces = []
            else:
                # Anything else is part of the current sequence.
                pieces.append(line)

    # Terminate the last record.
    resultList.append([label, headerLine, ''.join(pieces)])
    return resultList