from genestruct import Chromosome
from biogeneutil import num_short, find_consecutives, get_gene_intersect, consec_list
import scraper
import sets  # not used by the code below any more (built-in set is used); import retained


############
## Functions
############

def load_chrcontig(fpath, chrlist=None):
    """loads a GRIMM-compatible file, and returns a Chromosome list of the
    chromosomes in that file

    A line whose first token is a non-zero integer is treated as a gene line:
    its last token is dropped, the remaining tokens are converted to integers,
    and the third token of the line read just before it is used as the name.
    Lines that do not fit that pattern are skipped.

    fpath -- string filepath of the file
    chrlist -- optional list to append the Chromosomes to; a fresh list is
               created when omitted
    returns a Chromosome list (chrlist itself when one was passed in)"""
    if chrlist is None:
        # a literal [] default would be shared (and keep growing) across calls
        chrlist = []
    prev_tokens = ""
    with open(fpath, 'r') as grimmf:
        for raw in grimmf:
            tokens = raw.split()
            try:
                if int(tokens[0]):
                    del tokens[-1]  # drop the last token (the '$' end marker)
                    genes = [int(x) for x in tokens]
                    chrlist.append(Chromosome(genes, prev_tokens[2]))
            except (ValueError, IndexError):
                # header/blank line, or a gene line with no usable header
                # line before it -- skip it
                pass
            prev_tokens = tokens
    return chrlist


def get_relevant_actions(genlist, actionlist):
    """returns a list of the GRIMMActions where the argument genes appear

    genlist -- list of genes, in integer form
    actionlist -- list of GRIMMActions
    returns the GRIMMActions from actionlist (order kept) that contain at
    least one gene of genlist"""
    return [action for action in actionlist if check_union(action, genlist)]


def check_union(x_action, y_genlist):
    """checks to see if a GRIMMAction contains any appearance of any gene in a
    gene list

    x_action -- GRIMMAction to examine
    y_genlist -- list of genes to search for in x_action
    returns True if any genes in y_genlist appear in the before or after
    state of x_action, False otherwise"""
    actiongenes = set()
    for genes in x_action.get_before():
        actiongenes.update(genes)
    for genes in x_action.get_after():
        actiongenes.update(genes)
    return bool(actiongenes & set(y_genlist))


def open_grimm(fpath, tgtchr):
    """method that retrieves the genes from a file from a particular
    chromosome/contig

    The first line containing tgtchr is taken as the header; the genes are
    read from the line that follows it.

    fpath -- string filepath of a GRIMM-format file
    tgtchr -- string name of the chromosome/contig to locate (ex: '01_01')
    returns an integer list of genes (empty when the header is the last line
    of the file), or None when no line contains tgtchr"""
    with open(fpath, 'r') as grimmf:
        lines = iter(grimmf)
        for raw in lines:
            if tgtchr in raw:
                # [:-1] removes the '$' character that ends the gene line
                return [int(tok) for tok in next(lines, "").split()[:-1]]
    return None


def locate(fpath, tgtchr):
    """method that produces a list of chromosome/contig names where a set of
    genes can be found

    A header line is one whose second token is 'chr_contig'; its third token
    is the name and the line after it holds the genes.  Genes are compared by
    absolute value, so orientation (sign) is ignored.  Blank lines are
    skipped.

    fpath -- string filepath to a GRIMM-format file to use as the search area
    tgtchr -- list of genes to locate
    returns list of chromosome/contig names with any gene in tgtchr
    raises ValueError if a gene in tgtchr or on a gene line is not an integer"""
    targets = set([abs(int(x)) for x in tgtchr])
    loclist = []
    with open(fpath, 'r') as grimmf:
        lines = iter(grimmf)
        for raw in lines:
            header = raw.split()
            if len(header) < 2 or header[1] != 'chr_contig':
                continue
            # found a chr_contig: the next line holds its genes, and [:-1]
            # removes the '$' character
            found = [abs(int(y)) for y in next(lines, "").split()[:-1]]
            if len(header) > 2 and any(gene in targets for gene in found):
                loclist.append(header[2])
    return loclist


###########
## Classes
###########

# genome rearrangement network class
class RearrangementNetwork:
    """Ordered collection of RearrangementWrappers, one per GRIMMAction of a
    rearrangement scenario."""

    def __init__(self, actionlist, src_genome, tgt_genome):
        """RearrangementNetwork constructor

        actionlist -- list of GRIMMActions representing the rearrangements in
                      a scenario
        src_genome -- list of Chromosomes representing the source genome (for
                      labelling)
        tgt_genome -- list of Chromosomes representing the target genome (for
                      labelling)"""
        # list of RearrangementWrappers, in the order of actionlist
        self.networklist = [RearrangementWrapper(y, src_genome, tgt_genome)
                            for y in actionlist]

    def __len__(self):
        """handler for the len() method"""
        return len(self.networklist)

    def __repr__(self):
        """representation method: for every wrapper, its GRIMMAction between
        asterisks followed by the wrapper's own string form"""
        return "".join(["*" + str(v.get_grimmaction()) + "* \n" + str(v)
                        for v in self.networklist])

    def trace_chromosome(self, chrname):
        """returns a list of RearrangementWrappers where there is an action
        involving any chromosome/contig with the name 'chrname'

        chrname -- string name to search for"""
        return [t for t in self.networklist if t.has_chrcontig(chrname)]

    def trace_gene_active(self, genenum):
        """returns the list of RearrangementWrappers where there is an action
        involving any chromosome/contig with the gene genenum, in the altered
        section

        genenum -- gene number to search for"""
        return [t for t in self.networklist if t.has_gene_active(genenum)]

    def trace_intersect_chromosome(self, chrname1, chrname2):
        """returns the list of RearrangementWrappers where there is an action
        involving chromosome/contig chrname1 AND chrname2

        chrname1 -- first chr/contig to search for
        chrname2 -- second chr/contig to search for"""
        return [t for t in self.networklist
                if t.has_chrcontig(chrname1) and t.has_chrcontig(chrname2)]

    # getters
    def __getitem__(self, index):
        """returns the RearrangementWrapper(s) at index"""
        return self.networklist[index]


# genome rearrangement wrapper class
class RearrangementWrapper:
    """A GRIMMAction annotated with the chromosomes/contigs whose genes take
    part in it."""

    def __init__(self, grimm_action, src_genome, tgt_genome):
        """RearrangementWrapper constructor; annotates the sequences in the
        GRIMMAction by chromosomes

        grimm_action -- GRIMMAction to convert into a member of a
                        RearrangementNetwork
        src_genome -- list of Chromosomes, some of which may represent genes
                      within the grimm_action
        tgt_genome -- list of Chromosomes, some of which may represent genes
                      within the grimm_action"""
        self.action = grimm_action
        self.annotationlist = []    # list of annotations (Chromosome objects)
        self.gene_dictionary = {}   # gene -> chr/contig name (somewhat inverse of annotationlist)
        self._populate_dict(src_genome)
        self._populate_dict(tgt_genome)
        # relevant chromosomes from the source genome first, then the target
        self.annotationlist.extend(self._accumulate_genome(grimm_action, src_genome))
        self.annotationlist.extend(self._accumulate_genome(grimm_action, tgt_genome))

    def __repr__(self):
        """representation method: one 'name: genes' line per annotation"""
        return "".join([v.get_chrcontig() + ": " + str(v.get_chromosome()) + "\n"
                        for v in self.annotationlist])

    __str__ = __repr__  # string is same as representation

    def _populate_dict(self, genome):
        """populates the annotation dictionary

        Keys are absolute gene numbers, values are chr/contig names.  When a
        gene is seen again under a different name, the stored value becomes
        the tuple (previous value, new name).

        genome -- genome (list of Chromosomes) to populate the dictionary with"""
        for chrom in genome:
            name = chrom.get_chrcontig()
            for gene in chrom.get_chromosome():
                key = abs(gene)
                if key not in self.gene_dictionary:
                    self.gene_dictionary[key] = name
                    continue
                known = self.gene_dictionary[key]
                if not known == name:
                    self.gene_dictionary[key] = (known, name)

    def _accumulate_genome(self, grimmact, genome):
        """collects, for every Chromosome of genome, the genes it shares with
        the before and after states of grimmact

        When the before and after intersections are equal (and non-empty) a
        single Chromosome is produced; otherwise one for each non-empty side,
        before first.

        grimmact -- GRIMMAction to cross-check
        genome -- list of Chromosomes to accumulate against
        returns a list of new Chromosome objects named after their source"""
        accumulator = []
        for chrom in genome:
            name = chrom.get_chrcontig()
            before_acum = self._accumulate_state(grimmact.get_before(), chrom)
            after_acum = self._accumulate_state(grimmact.get_after(), chrom)
            if before_acum == after_acum:
                # identical on both sides: record it once (if there is anything)
                if len(before_acum) > 0:
                    accumulator.append(Chromosome(before_acum, name))
                continue
            if len(before_acum) > 0:
                accumulator.append(Chromosome(before_acum, name))
            if len(after_acum) > 0:
                accumulator.append(Chromosome(after_acum, name))
        return accumulator

    def _accumulate_state(self, statelist, chr):
        """collects the genes shared between a state of a GRIMMAction and a
        Chromosome

        statelist -- list of gene lists (lists within a list) representing
                     either the before or after state of a GRIMMAction
        chr -- Chromosome to accumulate against
        returns the concatenated get_gene_intersect results of every gene
        list in statelist against the Chromosome's genes"""
        accumulator = []
        for genes in statelist:
            accumulator.extend(get_gene_intersect(genes, chr.get_chromosome()))
        return accumulator

    def has_gene_active(self, gennum):
        """method to determine if this RearrangementWrapper contains gennum
        (in either orientation) among its altered genes

        gennum -- gene number to search for
        returns True if self contains that gene, False otherwise"""
        altered = self.action.get_after_altered() + self.action.get_before_altered()
        return any(gennum in genes or -gennum in genes for genes in altered)

    def has_chrcontig(self, chtgt):
        """method to determine if this RearrangementWrapper contains any genes
        from the chr/contig chtgt

        chtgt -- string value of the chr/contig to search for (substring match)
        returns True if self contains that chtgt, False otherwise"""
        return any(a.get_chrcontig().find(chtgt) >= 0 for a in self.annotationlist)

    # class getters
    def get_annotations(self):
        """returns the list of annotation Chromosomes"""
        return self.annotationlist

    def get_grimmaction(self):
        """returns the wrapped GRIMMAction"""
        return self.action

    def get_dictionary(self):
        """returns the gene -> chr/contig name dictionary"""
        return self.gene_dictionary

    def __getitem__(self, index):
        """returns the annotation(s) at index"""
        return self.annotationlist[index]


# parser class, inherits from html scraper
class GRIMMScraper(scraper.Scraper):
    """Scraper that collects GRIMMActions from the data it is fed."""

    def __init__(self, maxgen):
        """GRIMMScraper constructor

        maxgen -- upperlimit+1 of valid gene numbers"""
        scraper.Scraper.__init__(self)
        self.max, self.prev_line, self.cmarker_count = maxgen, "", 0
        self.action_found = False
        self.after = False
        self.in_altered = False
        # these two are only ever written by _flag_reset; created here so the
        # attributes exist from construction on
        self.max_found = False
        self.cmarker_found = False
        self.current_action = GRIMMAction()
        self.grimm_actions = []
        self.cur_container = []
        self.cur_altered = []

    def pdata(self, inchunk):
        """overrides handling of data between tags - just calls object
        _parser_handler method

        inchunk -- data inbetween tags"""
        self._parser_handler(inchunk)

    def handletag(self, name, attrs, thetag):
        """Called when we encounter a tag.  Is passed the tag name and a list
        of (attrname, attrvalue) - and the original tag contents as a string.

        Switches on altered mode when the tag contains b class="hl"; returns
        the tag text wrapped in angle brackets."""
        if thetag.find("b class=\"hl\"") >= 0:
            self.in_altered = True
        return '<' + thetag + '>'

    def endtag(self, thetag):
        """Called when we encounter a close tag.

        Switches altered mode off again when the close tag matches; returns
        the tag text wrapped in angle brackets.

        thetag -- contents of the close tag"""
        # NOTE(review): the original test on thetag was lost from the text
        # this file was restored from; only "... >= 0 and self.in_altered ==
        # True" survived.  "/b" (closing the b class="hl" tag that handletag
        # reacts to) is an assumption -- confirm against the scraper module.
        if thetag.find("/b") >= 0 and self.in_altered:
            self.in_altered = False
        return '<' + thetag + '>'

    def _parser_handler(self, inchunk):
        """method that handles the data between tags

        A chunk naming a rearrangement action, directly preceded by an
        all-digit chunk (the step), starts a new GRIMMAction.  While an action
        is open, every later chunk is split into tokens that are read as
        genes or markers.

        inchunk -- string value inbetween tags"""
        inchunk = inchunk.rstrip().lower()
        if not inchunk:
            return  # blank chunks are ignored and do not become prev_line
        if (self._check_action(inchunk) and self.prev_line.isdigit()
                and not self.action_found):
            self._start_action(inchunk)
        elif self.action_found:
            for token in inchunk.split():
                self._handle_token(token)
        self.prev_line = inchunk

    def _start_action(self, action_name):
        """stores the previous GRIMMAction (if it was named) and opens a new
        one, using prev_line as its step"""
        self.action_found = True
        self.after = False
        if self.current_action.get_action() != "":
            self.grimm_actions.append(self.current_action)
        self.current_action = GRIMMAction()
        self.current_action.add_action(action_name)
        self.current_action.add_step(self.prev_line)

    def _handle_token(self, token):
        """records one token of an open action: an in-range integer is a gene,
        a cmarker advances the marker count, anything else is ignored"""
        try:
            gene = int(token)
        except ValueError:  # not a num
            if self._check_cmarker(token):
                self.cmarker_count += 1
                self._handle_cmarker()
            return
        if not self._check_ends(gene):  # is a num below the maximum
            self.cur_container.append(gene)
            if self.in_altered:
                # gene where the action takes place (highlighted)
                self.cur_altered.append(gene)

    def _handle_cmarker(self):
        """acts on the current cmarker count: at specific counts the genes
        gathered so far are stored as a before or after sequence of the
        current action, and the last count of an action resets the flags"""
        count = self.cmarker_count
        is_reversal = self.current_action.get_action() == 'reversal'
        if count >= 4 and is_reversal:
            self._flag_reset()
            self._store_after()
        elif count == 3 and is_reversal:
            self.after = True
            self._store_before()
        elif count == 5:
            self.after = True
            self._store_before()
        elif count == 3:
            self._store_before()
        elif count == 7:
            self._store_after()
        elif count == 8:
            self._flag_reset()
            self._store_after()

    def _store_before(self):
        """moves the gathered genes into the current action's before state"""
        self.current_action.add_before(self.cur_container)
        self.current_action.add_before_altered(self.cur_altered)
        self.cur_altered = []
        self.cur_container = []

    def _store_after(self):
        """moves the gathered genes into the current action's after state"""
        self.current_action.add_after(self.cur_container)
        self.current_action.add_after_altered(self.cur_altered)
        self.cur_altered = []
        self.cur_container = []

    def _flag_reset(self):
        """Resets all flags in the class"""
        self.action_found = False
        self.max_found = False
        self.after = False
        self.cmarker_found = False
        self.cmarker_count = 0

    def _check_ends(self, end):
        """checks to see if the value passed is either an integer that reaches
        the max, or is a cmarker end mark

        end -- value to check
        returns True if the value is an end marker, False otherwise"""
        return self._check_max(end) or self._check_cmarker(end)

    def _check_action(self, strval):
        """checks to see if the string passed is a rearrangement action
        ('reversal', 'fission', 'fusion', 'translocation')

        strval -- string to check (trailing whitespace and case are ignored)
        returns True if a rearrangement action, False otherwise"""
        return strval.rstrip().lower() in ('reversal', 'translocation',
                                           'fission', 'fusion')

    def _check_max(self, gennum):
        """checks to see if the integer passed reaches the maximum gene number

        gennum -- number to check
        returns True if gennum >= the maximum, False otherwise (also when the
        comparison raises TypeError)"""
        try:
            return bool(gennum >= self.max)
        except TypeError:
            return False

    def _check_cmarker(self, mark):
        """checks to see if the string passed is a cmarker

        mark -- string to check
        returns True if mark is "c" or "-c", False otherwise"""
        return mark == "c" or mark == "-c"

    # get GRIMM actions
    def get_grimm_actions(self):
        """returns the list of stored GRIMMActions"""
        # NOTE(review): an action is only stored when the next one starts, so
        # the action still held in current_action is not part of this list --
        # confirm whether the final step is meant to be left out.
        return self.grimm_actions


# class to store actions
class GRIMMAction:
    """One rearrangement step: its step label, action name, and the before /
    after gene sequences together with their altered parts."""

    def __init__(self):
        """GRIMMAction constructor"""
        self.before = []
        self.after = []
        self.before_altered = []
        self.after_altered = []
        self.action = ""
        self.step = ""

    def __str__(self):
        """provides formatting for 'print'ing: '(step, action)'"""
        return '(%s, %s)' % (self.step, self.action)

    # was "__repr = __str__", which is name-mangled and never used by repr()
    __repr__ = __str__

    def add_action(self, action):
        """sets the action name"""
        self.action = action

    def add_step(self, step):
        """sets the step label"""
        self.step = step

    def get_action(self):
        """returns the action name ("" until one is set)"""
        return self.action

    def get_step(self):
        """returns the step label ("" until one is set)"""
        return self.step

    def add_before(self, bef):
        """appends one gene list to the before state"""
        self.before.append(bef)

    def add_after(self, af):
        """appends one gene list to the after state"""
        self.after.append(af)

    def get_before(self):
        """returns the before state (list of gene lists)"""
        return self.before

    def get_after(self):
        """returns the after state (list of gene lists)"""
        return self.after

    def add_before_altered(self, ba):
        """appends one list of altered genes to the before state"""
        self.before_altered.append(ba)

    def add_after_altered(self, aa):
        """appends one list of altered genes to the after state"""
        self.after_altered.append(aa)

    def get_before_altered(self):
        """returns the altered genes of the before state (list of gene lists)"""
        return self.before_altered

    def get_after_altered(self):
        """returns the altered genes of the after state (list of gene lists)"""
        return self.after_altered


############
## Testing
############

# main
if __name__ == '__main__':

    def _blocks(genes):
        """shortens genes with num_short and joins the resulting strings, each
        followed by a single space"""
        return "".join([part + " " for part in num_short(genes)])

    def pretty_trace():
        """User interaction method - allows user to select a chromosome to
        print out, and traces that chromosome through GRIMMActions.  Prompts
        again after every trace until input ends (EOF or Ctrl-C)."""
        while True:  # loop instead of the former unbounded self-recursion
            try:
                # raw_input: this script targets Python 2
                qry = raw_input("Please enter the chr/contig to search for (ex. Lmjf_01_01): ")
            except (EOFError, KeyboardInterrupt):
                break
            result = network.trace_chromosome(qry)  # retrieve the network
            for base in tb:
                if base.get_chrcontig().find(qry) >= 0:
                    print("Base chr/contig: " + _blocks(base.get_chromosome()))
            for base in lmjf:
                if base.get_chrcontig().find(qry) >= 0:
                    print("Base chr/contig: " + _blocks(base.get_chromosome()))
            for x in result:
                act = x.get_grimmaction()
                print("\nStep " + act.get_step() + ", action " + act.get_action())
                print("---------------------------")
                print("Before: ")
                for genes in act.get_before():
                    if len(genes) > 0:
                        # shorten the chromosomes into contiguous blocks
                        print("+ " + _blocks(genes))
                print("After: ")
                for genes in act.get_after():
                    if len(genes) > 0:
                        print("+ " + _blocks(genes))
                print("Altered Genes: ")
                for genes in act.get_before_altered():
                    if len(genes) > 0:
                        print(_blocks(genes))
                print("Relevant Chromosomes: ")
                for v in x.get_annotations():
                    print(str(v.get_chrcontig() + ": ") + _blocks(v.get_chromosome()))

    genes = open_grimm('tb.txt', '02_01')   # save the genes from Tb02_01
    f = locate('lmjf.txt', genes)           # chr/contigs that contain genes from Tb02_01
    parser = GRIMMScraper(475)
    with open('index.html', 'r') as filehandle:     # open the GRIMM results
        while True:                                 # read in the HTML data
            data = filehandle.read(10000)
            if not data:
                break
            parser.feed(data)
        actionlist = parser.get_grimm_actions()     # compile the GRIMMAction list
        processedfile = parser.close()
    # get all the GRIMMActions that are relevant for Tb_02_01
    relevants = get_relevant_actions(genes, actionlist)
    lmjf = load_chrcontig('lmjf.txt', [])
    tb = load_chrcontig('tb.txt', [])
    # build the RearrangementNetwork
    network = RearrangementNetwork(actionlist, tb, lmjf)
    pretty_trace()