Source code for integron_finder.integron

# -*- coding: utf-8 -*-

####################################################################################
# Integron_Finder - Integron Finder aims at detecting integrons in DNA sequences   #
# by finding particular features of the integron:                                  #
#   - the attC sites                                                               #
#   - the integrase                                                                #
#   - and when possible attI site and promoters.                                   #
#                                                                                  #
# Authors: Jean Cury, Bertrand Neron, Eduardo PC Rocha                             #
# Copyright (c) 2015 - 2018  Institut Pasteur, Paris and CNRS.                     #
# See the COPYRIGHT file for details                                               #
#                                                                                  #
# integron_finder is free software: you can redistribute it and/or modify          #
# it under the terms of the GNU General Public License as published by             #
# the Free Software Foundation, either version 3 of the License, or                #
# (at your option) any later version.                                              #
#                                                                                  #
# integron_finder is distributed in the hope that it will be useful,               #
# but WITHOUT ANY WARRANTY; without even the implied warranty of                   #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                    #
# GNU General Public License for more details.                                     #
#                                                                                  #
# You should have received a copy of the GNU General Public License                #
# along with this program (COPYING file).                                          #
# If not, see <http://www.gnu.org/licenses/>.                                      #
####################################################################################

import os
import colorlog
import numpy as np
import pandas as pd

from matplotlib import use as m_use
m_use("Agg")
import matplotlib.pyplot as plt
import matplotlib.colors

from Bio import Seq
from Bio import SeqIO
from Bio import motifs

from .hmm import read_hmm
from .infernal import read_infernal
from .attc import search_attc

_log = colorlog.getLogger(__name__)


def _replicon_topology(replicon, cfg):
    """Return the topology of *replicon*, falling back on ``cfg.default_topology``.

    ``Integron.describe`` applies the same fallback for replicons that do not
    carry a ``topology`` attribute (bare ``Bio.SeqRecord``).
    """
    try:
        return replicon.topology
    except AttributeError:
        return cfg.default_topology


def _closest_attc_array(attc_arrays, int_beg, int_end, circular, replicon_len):
    """Find the attC array closest to an integrase.

    :param attc_arrays: non empty list of DataFrame, one per attC array
    :param int_beg: begin position of the integrase
    :param int_end: end position of the integrase
    :param bool circular: True to measure distances modulo *replicon_len*
    :param int replicon_len: the length of the replicon
    :returns: (index of the closest array in *attc_arrays*, its distance)
    """
    attc_left = np.array([array.pos_beg.values[0] for array in attc_arrays])
    attc_right = np.array([array.pos_end.values[-1] for array in attc_arrays])
    # row 0 <=> array on the right of the integrase (integrase end -> array start)
    # row 1 <=> array on the left of the integrase (array end -> integrase start)
    if circular:
        distances = np.array([attc_left - int_end,
                              int_beg - attc_right]) % replicon_len
    else:
        distances = np.array([abs(attc_left - int_end),
                              abs(int_beg - attc_right)])
    # Several arrays may be equally distant (usually same side, opposite strands):
    # the first one in row-major order is taken arbitrarily.
    side, idx_attc = np.where(distances == distances.min())
    return int(idx_attc[0]), distances[side[0], idx_attc[0]]


def _add_attc_array(integron, attc_array, model_name):
    """Add every attC site of the DataFrame *attc_array* to *integron*."""
    for site in attc_array.values:
        # positional layout of an attC row:
        # Accession_number, attC, cm_debut, cm_fin, pos_beg, pos_end, sens, evalue
        integron.add_attC(site[4], site[5], 1 if site[6] == "+" else -1, site[7], model_name)


def find_integron(replicon, prot_db, attc_file, intI_file, phageI_file, cfg):
    """
    Function that looks for integrons given rules :

    * presence of intI
    * presence of attC
    * d(intI-attC) <= 4 kb
    * d(attC-attC) <= 4 kb

    It returns the list of all integrons, be they complete or not,
    found in attC files + integrases file which are formatted as follow :

    intI_file : Accession_number    ID_prot    strand    pos_beg    pos_end    evalue

    attc_file : Accession_number    attC    cm_debut    cm_fin    pos_beg    pos_end    sens    evalue

    Each integrase gives one integron. The closest remaining attC array is
    attached to it when it lies closer than ``cfg.distance_threshold``.
    Every attC array left without integrase gives an integron on its own.
    When the final set of attC is known (no local_max requested, or local_max
    already done) the CALIN with less than ``cfg.calin_threshold`` attC sites
    are discarded.

    .. note::
        When *attc_file* is a DataFrame it is sorted **in place**.

    :param replicon: the name of the replicon
    :type replicon: :class:`Bio.Seq.SeqRecord` object
    :param prot_db: the protein database corresponding to the replicon translation
    :type prot_db: a :class:`integron_finder.prot_db.ProteinDB` object.
    :param attc_file: the output of cmsearch or the result of parsing of this file by read_infernal
    :type attc_file: path to cmsearch output or :class:`pd.Dataframe`
    :param str intI_file: the output of hmmsearch with the integrase model
    :param str phageI_file: the output of hmmsearch with the phage model
    :param cfg: configuration
    :type cfg: a :class:`integron_finder.config.Config` object
    :returns: list of all integrons, be they complete or not
    :rtype: list of :class:`Integron` object
    """
    sort_keys = ["Accession_number", "pos_beg", "evalue"]

    ##############
    # integrases #
    ##############
    if not cfg.no_proteins:
        intI = read_hmm(replicon.id, prot_db, intI_file, cfg)
        intI.sort_values(sort_keys, inplace=True)

        phageI = read_hmm(replicon.id, prot_db, phageI_file, cfg)
        phageI.sort_values(sort_keys, inplace=True)

        # proteins matched by both profiles
        tmp = intI[intI.ID_prot.isin(phageI.ID_prot)].copy()
        if not tmp.empty:
            tmp.loc[:, "query_name"] = "intersection_tyr_intI"

        if cfg.union_integrases:
            intI_only = intI[~intI.ID_prot.isin(tmp.ID_prot)]
            phageI_only = phageI[~phageI.ID_prot.isin(tmp.ID_prot)]
            intI_ac = intI_only.merge(phageI_only, how="outer").merge(tmp, how="outer")
        else:
            intI_ac = tmp
    else:
        intI_ac = pd.DataFrame(columns=["Accession_number", "query_name", "ID_query", "ID_prot",
                                        "strand", "pos_beg", "pos_end", "evalue", "hmmfrom",
                                        "hmmto", "alifrom", "alito", "len_profile"])

    ########
    # attC #
    ########
    if isinstance(attc_file, pd.DataFrame):
        # attc_file is the already parsed result of the local_max search
        local_max_done = True
        attc = attc_file
    else:
        # this is the call which follows the default search
        local_max_done = False
        attc = read_infernal(attc_file,
                             replicon.id, cfg.model_len,
                             evalue=cfg.evalue_attc,
                             size_max_attc=cfg.max_attc_size,
                             size_min_attc=cfg.min_attc_size)
    attc.sort_values(sort_keys, inplace=True)

    # attc_ac = list of DataFrame, each one holds an array of attC
    attc_ac = search_attc(attc, cfg.keep_palindromes, cfg.distance_threshold, len(replicon))

    #######################
    # build the integrons #
    #######################
    integrons = []
    for i, id_int in enumerate(intI_ac.ID_prot.values):
        int_beg = intI_ac.pos_beg.values[i]
        int_end = intI_ac.pos_end.values[i]
        integron = Integron(replicon, cfg)
        integron.add_integrase(int_beg,
                               int_end,
                               id_int,
                               int(intI_ac.strand.values[i]),
                               intI_ac.evalue.values[i],
                               intI_ac.query_name.values[i])
        if attc_ac:
            # there are still attC arrays which are not attributed to an integrase
            circular = _replicon_topology(replicon, cfg) == 'circ'
            idx_attc, distance = _closest_attc_array(attc_ac, int_beg, int_end,
                                                     circular, len(replicon))
            if distance < cfg.distance_threshold:
                _add_attc_array(integron, attc_ac.pop(idx_attc), cfg.model_attc_name)
            # else: no array close enough to the integrase on either side
        integrons.append(integron)

    # the arrays which have not been clustered with an integrase
    for attc_array in attc_ac:
        integron = Integron(replicon, cfg)
        _add_attc_array(integron, attc_array, cfg.model_attc_name)
        integrons.append(integron)

    #########################################
    # filter CALIN integron on attc number  #
    #########################################
    # If a local_max is requested, the filter is applied only once it has been done
    if local_max_done or not cfg.local_max:
        _log.debug("filter out 'CALIN' with less attC sites than {}".format(cfg.calin_threshold))
        integrons = [i for i in integrons if i.type() != 'CALIN' or len(i.attC) >= cfg.calin_threshold]

    ###############
    # log summary #
    ###############
    nb_elements = {"complete": 0, "CALIN": 0, "In0": 0}
    nb_attc = {"complete": 0, "CALIN": 0, "In0": 0}
    for integron in integrons:
        integron_type = integron.type()
        if integron_type in nb_elements:
            nb_elements[integron_type] += 1
            nb_attc[integron_type] += len(integron.attC)

    _log.info("In replicon {}, there are:".format(replicon.id))
    _log.info("- {} complete integron(s) found with a total {} attC site(s)".format(
        nb_elements["complete"], nb_attc["complete"]))
    _log.info("- {} CALIN element(s) found with a total of {} attC site(s)".format(
        nb_elements["CALIN"], nb_attc["CALIN"]))
    _log.info("- {} In0 element(s) found with a total of {} attC site".format(
        nb_elements["In0"], nb_attc["In0"]))

    return integrons
class Integron(object):
    """Integron object represents an object composed of an integrase, attC sites and gene cassettes.
    Each element is characterized by their coordinates in the replicon, the strand (+ or -),
    the ID of the gene (except attC).
    The object Integron is also characterized by the ID of the replicon.

    The elements are stored in five DataFrame sharing the same columns:
    ``integrase``, ``attC``, ``promoter``, ``attI`` and ``proteins``.
    """

    def __init__(self, replicon, cfg):
        """
        :param replicon: The replicon where integrons has been found
        :type replicon: a :class:`Bio.Seq.SeqRecord` object
        :param cfg: the configuration
        :type cfg: a :class:`integron_finder.config.Config` object
        """
        self.cfg = cfg
        self.replicon = replicon
        self.replicon_size = len(self.replicon)
        self._columns = ["pos_beg", "pos_end", "strand", "evalue", "type_elt", "model", "distance_2attC", "annotation"]
        self._dtype = {"pos_beg": "int",
                       "pos_end": "int",
                       "strand": "int",
                       "evalue": "float",
                       "type_elt": "str",
                       "model": "str",
                       "distance_2attC": "float",
                       "annotation": "str"}
        self.integrase = self._empty_frame()
        self.attC = self._empty_frame()
        self.promoter = self._empty_frame()
        self.attI = self._empty_frame()
        self.proteins = self._empty_frame()
        # distance between consecutive attC sites, filled by add_attC
        self.sizes_cassettes = None

    @property
    def dtype(self):
        """
        :returns: a copy of the mapping column name -> dtype name of the elements tables
        :rtype: dict
        """
        return dict(self._dtype)

    ###################
    # private helpers #
    ###################

    def _empty_frame(self):
        """Return an empty DataFrame with the columns and dtypes shared by all elements."""
        return pd.DataFrame(columns=self._columns).astype(dtype=self._dtype)

    def _new_row(self, pos_beg, pos_end, strand, evalue, type_elt, model, annotation, index=None):
        """Return a one row DataFrame describing one element.

        ``distance_2attC`` is always NaN in the new row.

        :param index: the label of the row; a default integer index is used if None
        """
        return pd.DataFrame({"pos_beg": [pos_beg],
                             "pos_end": [pos_end],
                             "strand": [strand],
                             "evalue": [evalue],
                             "type_elt": [type_elt],
                             "model": [model],
                             "distance_2attC": [np.nan],
                             "annotation": [annotation]},
                            columns=self._columns,
                            index=None if index is None else [index])

    @staticmethod
    def _append_rows(frame, rows, ignore_index=False):
        """Return *frame* followed by the one row DataFrame of the list *rows*.

        ``DataFrame.append`` does not exist any longer in pandas >= 2.0,
        so ``pd.concat`` is used. An empty *frame* is left out of the
        concatenation so that it cannot influence the resulting dtypes.
        """
        if not rows:
            return frame
        parts = list(rows) if frame.empty else [frame] + list(rows)
        return pd.concat(parts, ignore_index=ignore_index)

    def _flanking_region(self):
        """Compute the region in which promoters and attI sites are searched.

        :returns: (left, right, strand_array) where

            - for a 'complete' integron, left and right delimit the region
              between the integrase and the attC array,
            - for 'In0', left and right are the limits of the integrase,
            - for 'CALIN', left and right are the limits of the attC array,
            - strand_array is the strand of the first attC site, or "both" for 'In0'.
        :raises RuntimeError: if the integron has neither integrase nor attC site
        """
        integron_type = self.type()
        if integron_type == "complete":
            if ((self.attC.pos_beg.values[0] - self.integrase.pos_end.values[0]) % self.replicon_size >
                    (self.integrase.pos_beg.values[0] - self.attC.pos_end.values[-1]) % self.replicon_size):
                # the integrase is after the attC sites (on the right)
                left = int(self.attC.pos_end.values[-1])
                right = int(self.integrase.pos_beg.values[0])
            else:
                left = int(self.integrase.pos_end.values[-1])
                right = int(self.attC.pos_beg.values[0])
            strand_array = self.attC.strand.unique()[0]
        elif integron_type == "In0":
            left = int(self.integrase.pos_beg.values[0])
            right = int(self.integrase.pos_end.values[-1])
            strand_array = "both"
        elif integron_type == "CALIN":
            left = int(self.attC.pos_beg.values[0])
            right = int(self.attC.pos_end.values[-1])
            strand_array = self.attC.strand.unique()[0]
        else:
            raise RuntimeError("cannot search motifs in an integron without integrase nor attC site.")
        return left, right, strand_array

    def _region_sequence(self, left, right, margin):
        """Return the replicon sequence from ``left - margin`` to ``right + margin``.

        When left >= right the region is considered to span the end of the
        replicon sequence: the two pieces are joined.
        """
        # NOTE(review): ``left - margin`` can be negative for an element close to
        # position 0; the slice then starts from the end of the sequence -- to confirm
        # that this is the wanted behavior for linear replicons.
        if left < right:
            return self.replicon.seq[left - margin:right + margin]
        return self.replicon.seq[left - margin:self.replicon_size] + self.replicon.seq[:right + margin]

    def _search_motifs(self, motif_list, sequence, offset, strand_array, type_elt, annotation_prefix):
        """Search each motif of *motif_list* in *sequence*.

        :param motif_list: list of motifs, each one with a ``name`` ending by its class number
        :param sequence: the sequence to scan
        :param int offset: the position on the replicon of the first base of *sequence*
        :param strand_array: 1 to search the motifs as is, "both" to search the motifs
                             and their reverse complement, any other value to search only
                             the reverse complement
        :param str type_elt: the value of the column 'type_elt' for the matches
        :param str annotation_prefix: the annotation is '<annotation_prefix>_<class number>'
        :returns: one row per match, in the order they have been found
        :rtype: list of one row :class:`pd.DataFrame`
        """
        # NOTE(review): ``Motif.instances`` may be deprecated in recent Biopython
        # releases -- confirm against the Biopython version in use.
        search_both = isinstance(strand_array, str) and strand_array == "both"
        rows = []
        for m in motif_list:
            if strand_array == 1:
                oriented_motifs = [m]
            elif search_both:
                oriented_motifs = [m.reverse_complement(), m]
            else:
                oriented_motifs = [m.reverse_complement()]
            for sa, mo in enumerate(oriented_motifs):
                # when both strands are searched: sa = 0 <=> strand -1, sa = 1 <=> strand 1
                strand = sa * 2 - 1 if search_both else strand_array
                for pos, s in mo.instances.search(sequence):
                    rows.append(self._new_row((offset + pos) % self.replicon_size,
                                              (offset + pos + len(s)) % self.replicon_size,
                                              strand,
                                              np.nan,
                                              type_elt,
                                              "NA",
                                              "%s_%s" % (annotation_prefix, m.name[-1]),
                                              index=m.name))
        return rows

    ##############
    # public API #
    ##############

    def add_integrase(self, pos_beg_int, pos_end_int, id_int, strand_int, evalue, model):
        """Adds integrases to the integron. Should be called once.

        :param int pos_beg_int: the position on the replicon of the beginning integrase site
        :param int pos_end_int: the position on replicon of the end of the integrase site
        :param str id_int: The protein id corresponding to the integrase
        :param int strand_int: the strand where is found the integrase 1 for forward, -1 for reverse
        :param float evalue: the evalue associated to this integrase
        :param str model: the name of integrase model (for instance intersection_tyr_intI)
        :raises RuntimeError: if an integrase has already been added
        """
        if not self.integrase.empty:
            raise RuntimeError("add_integrase should be called once.")
        row = self._new_row(pos_beg_int, pos_end_int, strand_int, evalue,
                            "protein", model, "intI", index=id_int)
        self.integrase = self._append_rows(self.integrase, [row])

    def add_attC(self, pos_beg_attC, pos_end_attC, strand, evalue, model):
        """Adds attC site to the Integron object.

        The sites must be added in the order of the array: the column
        'distance_2attC' holds the distance (modulo the replicon length)
        between the begin of a site and the end of the previously added one
        (NaN for the first site). The sites are labelled attc_001, attc_002, ...

        :param int pos_beg_attC: the position on the replicon of the beginning attc site
        :param int pos_end_attC: the position on replicon of the end of the attc site
        :param int strand: the strand where is found the attc 1 for forward, -1 for reverse
        :param float evalue: the evalue associated to this attc site
        :param str model: the name of attc model (for instance attc4)
        """
        row = self._new_row(pos_beg_attC, pos_end_attC, strand, evalue, "attC", model, "attC")
        # ignore_index resets the labels to 0..n-1, they are rebuilt below
        self.attC = self._append_rows(self.attC, [row], ignore_index=True)
        attC_len = len(self.attC)
        if attC_len < 2:
            self.sizes_cassettes = [np.nan]
        else:
            cassette_size = (self.attC.pos_beg.iloc[-1] - self.attC.pos_end.iloc[-2]) % len(self.replicon)
            self.sizes_cassettes.append(cassette_size)
        self.attC["distance_2attC"] = self.sizes_cassettes
        self.attC.index = ["attc_%03i" % int(j + 1) for j in self.attC.index]

    def type(self):
        """
        :returns: The type of the integrons:

            - 'complete' : Have one integrase and at least one attC
            - 'CALIN' : Have at least one attC
            - 'In0' : Just an integrase intI
            - None : neither integrase nor attC

        :rtype: str
        """
        has_integrase = self.has_integrase()
        has_attc = self.has_attC()
        if has_attc and has_integrase:
            return "complete"
        if has_integrase:
            return "In0"
        if has_attc:
            return "CALIN"
        return None

    def add_promoter(self):
        """
        Looks for known promoters if they exists within your integrons element.
        It takes 1s for about 13kb (original author's estimation).

        Two kinds of promoters are searched:

        - the promoter of the integrase (P_intI1), only if the integron has an integrase,
          in the 500 bp upstream of the integrase,
        - the promoters of the cassettes (Pc_int1 variants read from the file
          'variants_Pc_intI1.fst' of ``cfg.model_dir``, and Pc_int3).

        :raises RuntimeError: if the integron has neither integrase nor attC site
        """
        dist_prom = 500  # pb distance from edge of the element for which we seek promoter

        ######## Promoter of integrase #########
        if self.has_integrase():
            # PintI1
            p_intI1 = motifs.create([Seq.Seq("TTGCTGCTTGGATGCCCGAGGCATAGACTGTACA")])
            p_intI1.name = "P_intI1"
            # PintI2 and PintI3 are not known
            motifs_Pint = [p_intI1]

            int_beg = self.integrase.pos_beg.values[0]
            int_end = self.integrase.pos_end.max()
            int_strand = self.integrase.strand.values[0]
            seq_p_int = self.replicon.seq[int(self.integrase.pos_beg.min()) - dist_prom:
                                          int(int_end) + dist_prom]
            pint_rows = []
            for m in motifs_Pint:
                if int_strand == 1:
                    # the promoter is searched in the flank preceding the integrase begin
                    matches = m.instances.search(seq_p_int[:dist_prom])
                    origin = int_beg - dist_prom
                else:
                    # the promoter is searched in the flank following the integrase end
                    matches = m.instances.reverse_complement().search(seq_p_int[-dist_prom:])
                    origin = int_end
                for pos, s in matches:
                    pint_rows.append(self._new_row(origin + pos,
                                                   origin + pos + len(s),
                                                   int_strand,
                                                   np.nan,
                                                   "Promoter",
                                                   "NA",
                                                   "Pint_%s" % (m.name[-1]),
                                                   index=m.name))
            self.promoter = self._append_rows(self.promoter, pint_rows)

        ######## Promoter of K7 #########
        # Pc-int1: one motif per length of variant
        variants_by_len = {}
        pc_path = os.path.join(self.cfg.model_dir, "variants_Pc_intI1.fst")
        for record in SeqIO.parse(pc_path, "fasta"):
            variants_by_len.setdefault(len(record), []).append(record.seq.upper())
        motifs_Pc = []
        for variants in variants_by_len.values():
            pc_int1 = motifs.create(variants)
            pc_int1.name = "Pc_int1"
            motifs_Pc.append(pc_int1)
        # Pc-int2 is not known

        # Pc-int3
        pc_intI3 = motifs.create([Seq.Seq("TAGACATAAGCTTTCTCGGTCTGTAGGCTGTAATG"),
                                  Seq.Seq("TAGACATAAGCTTTCTCGGTCTGTAGGATGTAATG")])
        pc_intI3.name = "Pc_int3"
        motifs_Pc.append(pc_intI3)

        left, right, strand_array = self._flanking_region()
        seq_Pc = self._region_sequence(left, right, dist_prom)
        pc_rows = self._search_motifs(motifs_Pc, seq_Pc, left - dist_prom, strand_array, "Promoter", "Pc")
        self.promoter = self._append_rows(self.promoter, pc_rows)

    def add_attI(self):
        """
        Looking for attI sites (class 1, 2 and 3) and add them to this integron.

        :raises RuntimeError: if the integron has neither integrase nor attC site
        """
        dist_atti = 500  # pb distance from edge of the element for which we seek attI

        # attI1
        attI1 = motifs.create([Seq.Seq('TGATGTTATGGAGCAGCAACGATGTTACGCAGCAGGGCAGTCGCCCTAAAACAAAGTT')])
        attI1.name = "attI1"

        # attI2
        attI2 = motifs.create([Seq.Seq('TTAATTAACGGTAAGCATCAGCGGGTGACAAAACGAGCATGCTTACTAATAAAATGTT')])
        attI2.name = "attI2"

        # attI3
        attI3 = motifs.create([Seq.Seq('CTTTGTTTAACGACCACGGTTGTGGGTATCCGGTGTTTGGTCAGATAAACCACAAGTT')])
        attI3.name = "attI3"

        motif_attI = [attI1, attI2, attI3]

        left, right, strand_array = self._flanking_region()
        seq_attI = self._region_sequence(left, right, dist_atti)
        atti_rows = self._search_motifs(motif_attI, seq_attI, left - dist_atti, strand_array, "attI", "attI")
        self.attI = self._append_rows(self.attI, atti_rows)

    def add_proteins(self, prot_db):
        """
        Add to the integron the proteins of *prot_db* located in the integron window.
        The integron must have at least one attC site.

        The window goes from the integrase to the attC array (extended by 200 bp
        after the attC side), or is the attC array extended by 200 bp on both
        sides if there is no integrase.

        :param prot_db: The protein db corresponding to the translation of the replicon
        :type prot_db: :class:`integron.prot_db.ProteinDB` object.
        """
        attc_start = self.attC.pos_beg.values[0]
        attc_end = self.attC.pos_end.values[-1]

        if self.has_integrase():
            if ((attc_start - self.integrase.pos_end.values[0]) % self.replicon_size >
                    (self.integrase.pos_beg.values[0] - attc_end) % self.replicon_size):
                # integrase on the right of attC cluster.
                window_start = attc_start - 200
                window_end = self.integrase.pos_beg.min()
            else:
                window_start = self.integrase.pos_end.max()
                window_end = attc_end + 200
        else:
            # To allow the first protein after last attC to aggregate.
            window_start = attc_start - 200
            window_end = attc_end + 200

        # size of the window, it does not depend on the protein
        s_int = (window_end - window_start) % self.replicon_size

        for prot_id in prot_db:
            prot_attr = prot_db.get_description(prot_id)
            # We keep proteins (<--->) if start (<) and end (>) follows that scheme:
            #
            # ok:            <--->         <--->
            # ok:  <--->                            <--->
            #          ^ 200pb v                  v 200pb ^
            #                  |------integron------|
            #          window_start            window_end
            if ((window_end - prot_attr.stop) % self.replicon_size < s_int) or \
                    ((prot_attr.start - window_start) % self.replicon_size < s_int):
                self.proteins.loc[prot_attr.id] = [prot_attr.start, prot_attr.stop, prot_attr.strand,
                                                   np.nan, "protein", "NA", np.nan, "protein"]

        # adding rows with .loc can change the dtypes of the columns
        intcols = ["pos_beg", "pos_end", "strand"]
        floatcols = ["evalue", "distance_2attC"]
        self.proteins[intcols] = self.proteins[intcols].astype(int)
        self.proteins[floatcols] = self.proteins[floatcols].astype(float)

    def describe(self):
        """
        :returns: DataFrame describing the integron object, one row per element.
                  The columns are: "element", "pos_beg", "pos_end", "strand", "evalue",
                  "type_elt", "model", "distance_2attC", "annotation",
                  "type", "ID_replicon", "ID_integron", "default", "considered_topology".
                  If several elements have the same name, only the first one is kept.
        :rtype: :class:`pd.DataFrame`
        """
        tables = [table for table in (self.integrase, self.attC, self.promoter, self.attI, self.proteins)
                  if not table.empty]
        # pd.concat returns a new object, the tables of the integron are not modified below
        full = pd.concat(tables) if tables else self._empty_frame()
        full["pos_beg"] = full["pos_beg"].astype(int)
        full["pos_end"] = full["pos_end"].astype(int)
        full["strand"] = full["strand"].astype(int)
        full["distance_2attC"] = full["distance_2attC"].astype(float)
        full = full.reset_index()
        full.columns = ["element"] + list(full.columns[1:])
        full["type"] = self.type()
        full["ID_replicon"] = self.replicon.id
        full["ID_integron"] = id(self)  # uniq identifier of a given Integron
        full["default"] = "Yes" if not self.cfg.local_max else "No"
        try:
            # when replicon has been got using utils.FastaIterator
            full["considered_topology"] = self.replicon.topology
        except AttributeError:
            # if replicon is a bare Bio.SeqRecord
            full["considered_topology"] = self.cfg.default_topology
        full.drop_duplicates(subset=["element"], inplace=True)
        return full

    def draw_integron(self, file=None):
        """
        Represent the different element of the integrons
        if file is provide save the drawing on the file
        otherwise display it on screen.

        :param str file: the path to save the integron schema (in pdf format)
        """
        full = self.describe()
        full["evalue"] = full["evalue"].astype("float")
        # the promoters are drawn 1.5 times higher than the other elements
        h = [strand + (0.5 * strand) if type_elt == "Promoter" else strand
             for strand, type_elt in zip(full.strand, full.type_elt)]
        fig, ax = plt.subplots(1, 1, figsize=(16, 9))

        # log10(evalue) is mapped linearly from [-1, -10] to [min_alpha, 1],
        # the elements without evalue are opaque.
        # The values are clamped because matplotlib rejects alpha outside [0, 1].
        min_alpha = 0.2
        alpha = ((np.log10(full.evalue) + 1) / (-10 + 1) * (1 - min_alpha) + min_alpha).fillna(1)
        alpha = alpha.clip(lower=min_alpha, upper=1).tolist()

        # NOTE(review): the attI sites (type_elt "attI") get the generic color "#e8950e"
        # whereas the legend groups them with the promoters -- to confirm.
        colors = ["#749FCD" if i == "attC" else
                  "#DD654B" if i == "intI" else
                  "#6BC865" if (i[-2:] == "_1" and j == "Promoter") else
                  "#D06CC0" if (i[-2:] == "_2" and j == "Promoter") else
                  "#C3B639" if (i[-2:] == "_3" and j == "Promoter") else
                  "#e8950e" if i != "protein" else
                  "#d3d3d3" for (i, j) in zip(full.annotation, full.type_elt)]
        colors_alpha = [matplotlib.colors.to_rgba_array(c, a)[0].tolist() for c, a in zip(colors, alpha)]

        z_order = 10
        ax.barh(np.zeros(len(full)), full.pos_end - full.pos_beg,
                height=h, left=full.pos_beg,
                color=colors_alpha, zorder=z_order, ec=None)
        xlims = ax.get_xlim()

        # empty bars, only there to build the legend
        legend_entries = [("#749FCD", "attC"),
                          ("#DD654B", "integrase"),
                          ("#6BC865", "Promoter/attI class 1"),
                          ("#D06CC0", "Promoter/attI class 2"),
                          ("#C3B639", "Promoter/attI class 3"),
                          ("#e8950e", "Functional Annotation"),
                          ("#d3d3d3", "Hypothetical Protein")]
        for color, label in legend_entries:
            ax.bar(0, 0, color=color, label=label)
        ax.legend(loc=[1.01, 0.4])
        # the empty bars have changed the limits of the x axis
        ax.set_xlim(xlims)
        fig.subplots_adjust(left=0.05, right=0.80)
        ax.hlines(0, ax.get_xlim()[0], ax.get_xlim()[1], colors="lightgrey", linestyles="--")
        ax.grid(True, "major", axis="x")
        ax.set_ylim(-4, 4)
        ax.get_yaxis().set_visible(False)
        if file:
            fig.savefig(file, format="pdf")
            plt.close(fig)
        else:
            fig.show()

    def has_integrase(self):
        """
        :return: True if integron has integrase False otherwise.
        """
        return not self.integrase.empty

    def has_attC(self):
        """
        :return: True if integron has attc sites False otherwise.
        """
        return not self.attC.empty