# -*- coding: utf-8 -*-
####################################################################################
# Integron_Finder - Integron Finder aims at detecting integrons in DNA sequences #
# by finding particular features of the integron: #
# - the attC sites #
# - the integrase #
# - and when possible attI site and promoters. #
# #
# Authors: Jean Cury, Bertrand Neron, Eduardo PC Rocha #
# Copyright (c) 2015 - 2018 Institut Pasteur, Paris and CNRS. #
# See the COPYRIGHT file for details #
# #
# integron_finder is free software: you can redistribute it and/or modify #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation, either version 3 of the License, or #
# (at your option) any later version. #
# #
# integron_finder is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
# GNU General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program (COPYING file). #
# If not, see <http://www.gnu.org/licenses/>. #
####################################################################################
import os
import colorlog
import numpy as np
import pandas as pd
from matplotlib import use as m_use
m_use("Agg")
import matplotlib.pyplot as plt
import matplotlib.colors
from Bio import Seq
from Bio import SeqIO
from Bio import motifs
from .hmm import read_hmm
from .infernal import read_infernal
from .attc import search_attc
_log = colorlog.getLogger(__name__)
[docs]def find_integron(replicon, prot_db, attc_file, intI_file, phageI_file, cfg):
"""
Function that looks for integrons given rules :
* presence of intI
* presence of attC
* d(intI-attC) <= 4 kb
* d(attC-attC) <= 4 kb
It returns the list of all integrons, be they complete or not.
found in attC files + integrases file which are formatted as follow :
intI_file : Accession_number ID_prot strand pos_beg pos_end evalue
attc_file : Accession_number attC cm_debut cm_fin pos_beg pos_end sens evalue
:param replicon: the name of the replicon
:type replicon: :class:`Bio.Seq.SeqRecord` object
:param prot_db: the protein database corresponding to the replicon translation
:type prot_db: a :class:`integron_finder.prot_db.ProteinDB` object.
:param attc_file: the output of cmsearch or the result of parsing of this file by read_infernal
:type attc_file: path to cmsearch output or :class:`pd.Dataframe`
:param str intI_file: the output of hmmsearch with the integrase model
:param str phageI_file: the output of hmmsearch with the phage model
:param cfg: configuration
:type cfg: a :class:`integron_finder.config.Config` object
:returns: list of all integrons, be they complete or not
:retype: list of :class:`Integron` object
"""
if not cfg.no_proteins:
intI = read_hmm(replicon.id, prot_db, intI_file, cfg)
intI.sort_values(["Accession_number", "pos_beg", "evalue"], inplace=True)
phageI = read_hmm(replicon.id, prot_db, phageI_file, cfg)
phageI.sort_values(["Accession_number", "pos_beg", "evalue"], inplace=True)
tmp = intI[intI.ID_prot.isin(phageI.ID_prot)].copy()
if not tmp.empty:
tmp.loc[:, "query_name"] = "intersection_tyr_intI"
if cfg.union_integrases:
intI_ac = intI[intI.ID_prot.isin(tmp.ID_prot) == 0].merge(phageI[phageI.ID_prot.isin(tmp.ID_prot) == 0],
how="outer").merge(tmp, how="outer")
else:
intI_ac = tmp
else:
intI_ac = pd.DataFrame(columns=["Accession_number", "query_name", "ID_query",
"ID_prot", "strand", "pos_beg", "pos_end",
"evalue", "hmmfrom", "hmmto", "alifrom",
"alito", "len_profile"])
# if attc_file is integron_max
if isinstance(attc_file, pd.DataFrame):
local_max_done = True
attc = attc_file
attc.sort_values(["Accession_number", "pos_beg", "evalue"], inplace=True)
# else it the call after default search
else:
local_max_done = False
attc = read_infernal(attc_file, replicon.id, cfg.model_len,
evalue=cfg.evalue_attc,
size_max_attc=cfg.max_attc_size,
size_min_attc=cfg.min_attc_size)
attc.sort_values(["Accession_number", "pos_beg", "evalue"], inplace=True)
# attc_ac = list of Dataframe, each have a an array of attC
attc_ac = search_attc(attc, cfg.keep_palindromes, cfg.distance_threshold, len(replicon))
integrons = []
if not intI_ac.empty and attc_ac:
n_attc_array = len(attc_ac)
# If an array hasn't been clustered with an Integrase
# or if an integrase lacks an array
# redundant info, we could check for len(attc_ac)==0
# -> to remove
for i, id_int in enumerate(intI_ac.ID_prot.values): # For each Integrase
if n_attc_array == 0: # No more array to attribute to an integrase
integrons.append(Integron(replicon, cfg))
integrons[-1].add_integrase(intI_ac.pos_beg.values[i],
intI_ac.pos_end.values[i],
id_int,
int(intI_ac.strand.values[i]),
intI_ac.evalue.values[i],
intI_ac.query_name.values[i])
else: # we still have attC and int :
attc_left = np.array([i_attc.pos_beg.values[0] for i_attc in attc_ac])
attc_right = np.array([i_attc.pos_end.values[-1] for i_attc in attc_ac])
if replicon.topology == 'circ':
distances = np.array([(attc_left - intI_ac.pos_end.values[i]),
(intI_ac.pos_beg.values[i] - attc_right)]) % len(replicon)
else:
distances = np.array([abs(attc_left - intI_ac.pos_end.values[i]),
abs(intI_ac.pos_beg.values[i] - attc_right)])
if attc_ac:
# tmp = (distances /
# np.array([[len(aac) for aac in attc_ac]]))
side, idx_attc = np.where(distances == distances.min())
# side : 0 <=> left; 1 <=> right
# index of the closest and biggest attC array to the integrase
# exactly tmp = dist(cluster to integrase) / size cluster
# to make a decision between 2 equally distant arrays
# Usually they are on the same side but on 2 different strands
# If they are exactly similar (same distance, same number of attC, take the first one arbitrarily
# Or just flatten from idx_attc=[i] to idx_attc=i
idx_attc = idx_attc[0]
side = side[0]
else:
idx_attc = 0
side = np.argmin(distances)
if distances[side, idx_attc] < cfg.distance_threshold:
integrons.append(Integron(replicon, cfg))
integrons[-1].add_integrase(intI_ac.pos_beg.values[i],
intI_ac.pos_end.values[i],
id_int,
int(intI_ac.strand.values[i]),
intI_ac.evalue.values[i],
intI_ac.query_name.values[i])
attc_tmp = attc_ac.pop(idx_attc)
for a_tmp in attc_tmp.values:
integrons[-1].add_attC(a_tmp[4],
a_tmp[5],
1 if a_tmp[6] == "+" else -1,
a_tmp[7], cfg.model_attc_name)
n_attc_array -= 1
else: # no array close to the integrase on both side
integrons.append(Integron(replicon, cfg))
integrons[-1].add_integrase(intI_ac.pos_beg.values[i],
intI_ac.pos_end.values[i],
id_int,
int(intI_ac.strand.values[i]),
intI_ac.evalue.values[i], intI_ac.query_name.values[i])
if n_attc_array > 0: # after the integrase loop (<=> no more integrases)
for attc_array in attc_ac:
integrons.append(Integron(replicon, cfg))
for a_tmp in attc_array.values:
integrons[-1].add_attC(a_tmp[4],
a_tmp[5],
1 if a_tmp[6] == "+" else -1,
a_tmp[7], cfg.model_attc_name)
elif intI_ac.pos_end.values.size == 0 and attc_ac: # If attC only
for attc_array in attc_ac:
integrons.append(Integron(replicon, cfg))
for a_tmp in attc_array.values:
integrons[-1].add_attC(a_tmp[4],
a_tmp[5],
1 if a_tmp[6] == "+" else -1,
a_tmp[7], cfg.model_attc_name)
elif intI_ac.pos_end.values.size >= 1 and not attc_ac: # If intI only
for i, id_int in enumerate(intI_ac.ID_prot.values):
integrons.append(Integron(replicon, cfg))
integrons[-1].add_integrase(intI_ac.pos_beg.values[i],
intI_ac.pos_end.values[i],
id_int,
int(intI_ac.strand.values[i]),
intI_ac.evalue.values[i],
intI_ac.query_name.values[i])
#########################################
# filter CALIN integron on attc number #
#########################################
# Only after local_max if it will be called
if (cfg.local_max and local_max_done) or not cfg.local_max:
_log.debug("filter out 'CALIN' with less attC sites than {}".format(cfg.calin_threshold))
integrons = [i for i in integrons if i.type() != 'CALIN' or len(i.attC) >= cfg.calin_threshold]
###############
# log summary #
###############
_log.info("In replicon {}, there are:".format(replicon.id))
_log.info("- {} complete integron(s) found with a total {} attC site(s)".format(sum(
[1 if i.type() == "complete" else 0 for i in integrons]),
sum([len(i.attC) if i.type() == "complete" else 0 for i in integrons])))
_log.info("- {} CALIN element(s) found with a total of {} attC site(s)".format(sum(
[1 if i.type() == "CALIN" else 0 for i in integrons]),
sum([len(i.attC) if i.type() == "CALIN" else 0 for i in integrons])))
_log.info("- {} In0 element(s) found with a total of {} attC site".format(sum(
[1 if i.type() == "In0" else 0 for i in integrons]),
sum([len(i.attC) if i.type() == "In0" else 0 for i in integrons])))
return integrons
[docs]class Integron(object):
"""Integron object represents an object composed of an integrase, attC sites and gene cassettes.
Each element is characterized by their coordinates in the replicon, the strand (+ or -),
the ID of the gene (except attC).
The object Integron is also characterized by the ID of the replicon."""
[docs] def __init__(self, replicon, cfg):
"""
:param replicon: The replicon where integrons has been found
:type replicon: a :class:`Bio.Seq.SeqRecord` object
:param cfg: the configuration
:type cfg: a :class:`integron_finder.config.Config` object
"""
self.cfg = cfg
self.replicon = replicon
self.replicon_size = len(self.replicon)
self._columns = ["pos_beg", "pos_end", "strand", "evalue", "type_elt", "model", "distance_2attC", "annotation"]
self._dtype = {"pos_beg": "int",
"pos_end": "int",
"strand": "int",
"evalue": "float",
"type_elt": "str",
"model": "str",
"distance_2attC": "float",
"annotation": "str"}
self.integrase = pd.DataFrame(columns=self._columns)
self.integrase = self.integrase.astype(dtype=self._dtype)
self.attC = pd.DataFrame(columns=self._columns)
self.attC = self.attC.astype(dtype=self._dtype)
self.promoter = pd.DataFrame(columns=self._columns)
self.promoter = self.promoter.astype(dtype=self._dtype)
self.attI = pd.DataFrame(columns=self._columns)
self.attI = self.attI.astype(dtype=self._dtype)
self.proteins = pd.DataFrame(columns=self._columns)
self.proteins = self.proteins.astype(dtype=self._dtype)
self.sizes_cassettes = None
@property
def dtype(self):
return {k: v for k, v in self._dtype.items()}
[docs] def add_integrase(self, pos_beg_int, pos_end_int, id_int, strand_int, evalue, model):
"""Adds integrases to the integron. Should be called once.
:param int pos_beg_int: the position on the replicon of the beginning integrase site
:param int pos_end_int: the position on replicon of the end of the integrase site
:param str id_int: The protein id corresponding to the integrase
:param int strand_int: the strand where is found the attc 1 for forward, -1 for reverse
:param float evalue: the evalue associated to this attc site
:param str model: the name of integrase model (for instance intersection_tyr_intI)
"""
if not self.integrase.empty:
raise RuntimeError("add_integrase should be called once.")
tmp_df = pd.DataFrame(columns=self._columns)
tmp_df = tmp_df.astype(dtype=self._dtype)
tmp_df["pos_beg"] = [pos_beg_int]
tmp_df["pos_end"] = [pos_end_int]
tmp_df["strand"] = [strand_int]
tmp_df["evalue"] = [evalue]
tmp_df["type_elt"] = "protein"
tmp_df["annotation"] = "intI"
tmp_df["model"] = [model]
tmp_df.index = [id_int]
tmp_df["distance_2attC"] = [np.nan]
self.integrase = self.integrase.append(tmp_df)
[docs] def add_attC(self, pos_beg_attC, pos_end_attC, strand, evalue, model):
"""Adds attC site to the Integron object.
:param int pos_beg_attC: the position on the replicon of the beginning attc site
:param int pos_end_attC: the position on replicon of the end of the attc site
:param int strand: the strand where is found the attc 1 for forward, -1 for reverse
:param float evalue: the evalue associated to this attc site
:param str model: the name of attc model (for instance attc4)
"""
tmp_df = pd.DataFrame(columns=self._columns)
tmp_df = tmp_df.astype(dtype=self._dtype)
tmp_df["pos_beg"] = [pos_beg_attC]
tmp_df["pos_end"] = [pos_end_attC]
tmp_df["strand"] = [strand]
tmp_df["evalue"] = [evalue]
tmp_df["type_elt"] = "attC"
tmp_df["annotation"] = "attC"
tmp_df["model"] = [model]
self.attC = self.attC.append(tmp_df, ignore_index=True)
attC_len = len(self.attC)
if attC_len < 2:
self.sizes_cassettes = [np.nan]
else:
self.sizes_cassettes.append((self.attC.iloc[attC_len - 1].pos_beg -
self.attC.iloc[attC_len - 2].pos_end) % len(self.replicon))
self.attC["distance_2attC"] = self.sizes_cassettes
# self.attC.sort_values(["pos_beg"], inplace = True)
self.attC.index = ["attc_%03i" % int(j + 1) for j in self.attC.index]
[docs] def type(self):
"""
:returns: The type of the integrons:
- 'complete' : Have one integrase and at least one attC
- 'CALIN' : Have at least one attC
- 'In0' : Just an integrase intI
:rtype: str
"""
if not self.attC.empty and not self.integrase.empty:
return "complete"
elif self.attC.empty and not self.integrase.empty:
return "In0"
elif not self.attC.empty and self.integrase.empty:
return "CALIN"
[docs] def add_attI(self):
"""
Looking for Att1 sites and add them to this integron.
"""
dist_atti = 500
# attI1
instances_attI1 = [Seq.Seq('TGATGTTATGGAGCAGCAACGATGTTACGCAGCAGGGCAGTCGCCCTAAAACAAAGTT')]
attI1 = motifs.create(instances_attI1)
attI1.name = "attI1"
# attI2
instances_attI2 = [Seq.Seq('TTAATTAACGGTAAGCATCAGCGGGTGACAAAACGAGCATGCTTACTAATAAAATGTT')]
attI2 = motifs.create(instances_attI2)
attI2.name = "attI2"
# attI3
instances_attI3 = [Seq.Seq('CTTTGTTTAACGACCACGGTTGTGGGTATCCGGTGTTTGGTCAGATAAACCACAAGTT')]
attI3 = motifs.create(instances_attI3)
attI3.name = "attI3"
motif_attI = [attI1, attI2, attI3]
if self.type() == "complete":
if ((self.attC.pos_beg.values[0] - self.integrase.pos_end.values[0]) % self.replicon_size >
(self.integrase.pos_beg.values[0] - self.attC.pos_end.values[-1]) % self.replicon_size):
# if integrase after attcs (on the right)
left = int(self.attC.pos_end.values[-1])
right = int(self.integrase.pos_beg.values[0])
else:
left = int(self.integrase.pos_end.values[-1])
right = int(self.attC.pos_beg.values[0])
strand_array = self.attC.strand.unique()[0]
elif self.type() == "In0":
left = int(self.integrase.pos_beg)
right = int(self.integrase.pos_end)
strand_array = "both"
elif self.type() == "CALIN":
left = int(self.attC.pos_beg.values[0])
right = int(self.attC.pos_end.values[-1])
strand_array = self.attC.strand.unique()[0]
if left < right:
seq_attI = self.replicon.seq[left - dist_atti:right + dist_atti]
else:
seq_attI1 = self.replicon.seq[left - dist_atti:self.replicon_size]
seq_attI2 = self.replicon.seq[:right + dist_atti]
seq_attI = seq_attI1 + seq_attI2
for m in motif_attI:
if strand_array == 1:
mot = [m]
elif strand_array == "both":
mot = [m.reverse_complement(), m]
else:
mot = [m.reverse_complement()]
for sa, mo in enumerate(mot):
for pos, s in mo.instances.search(seq_attI):
tmp_df = pd.DataFrame(columns=self._columns)
tmp_df = tmp_df.astype(dtype=self._dtype)
tmp_df["pos_beg"] = [(left - dist_atti + pos) % self.replicon_size]
tmp_df["pos_end"] = [(left - dist_atti + pos + len(s)) % self.replicon_size]
tmp_df["strand"] = [strand_array] if strand_array != "both" else [sa * 2 - 1]
tmp_df["evalue"] = [np.nan]
tmp_df["type_elt"] = "attI"
tmp_df["annotation"] = "attI_%s" % (m.name[-1])
tmp_df["model"] = "NA"
tmp_df.index = [m.name]
tmp_df["distance_2attC"] = [np.nan]
self.attI = self.attI.append(tmp_df)
[docs] def add_proteins(self, prot_db):
"""
:param prot_db: The protein db corresponding to the translation of the replicon
:type prot_db: :class:`integron.prot_db.ProteinDB` object.
"""
attc_start = self.attC.pos_beg.values[0]
attc_end = self.attC.pos_end.values[-1]
if self.has_integrase():
if ((attc_start - self.integrase.pos_end.values[0]) % self.replicon_size >
(self.integrase.pos_beg.values[0] - attc_end) % self.replicon_size):
# integrase on the right of attC cluster.
window_start = attc_start - 200
window_end = self.integrase.pos_beg.min()
else:
window_start = self.integrase.pos_end.max()
window_end = attc_end + 200
else:
# To allow the first protein after last attC to aggregate.
window_start = attc_start - 200
window_end = attc_end + 200
for prot_id in prot_db:
prot_attr = prot_db.get_description(prot_id)
s_int = (window_end - window_start) % self.replicon_size
if ((window_end - prot_attr.stop) % self.replicon_size < s_int) or \
((prot_attr.start - window_start) % self.replicon_size < s_int):
# We keep proteins (<--->) if start (<) and end (>) follows that scheme:
#
# ok: <---> <--->
# ok: <---> <--->
# ^ 200pb v v 200pb ^
# |------integron------|
# window_start fin
prot_annot = "protein"
prot_evalue = np.nan
prot_model = "NA"
self.proteins.loc[prot_attr.id] = [prot_attr.start, prot_attr.stop, prot_attr.strand, prot_evalue,
"protein", prot_model, np.nan, prot_annot]
intcols = ["pos_beg", "pos_end", "strand"]
floatcols = ["evalue", "distance_2attC"]
self.proteins[intcols] = self.proteins[intcols].astype(int)
self.proteins[floatcols] = self.proteins[floatcols].astype(float)
[docs] def describe(self):
"""
:returns: DataFrame describing the integron object
The columns are:
"pos_beg", "pos_end", "strand", "evalue", "type_elt", "model",
"distance_2attC", "annotation", "considered_topology"
"""
full = pd.concat([self.integrase, self.attC, self.promoter, self.attI, self.proteins])
full["pos_beg"] = full["pos_beg"].astype(int)
full["pos_end"] = full["pos_end"].astype(int)
full["strand"] = full["strand"].astype(int)
full["distance_2attC"] = full["distance_2attC"].astype(float)
full = full.reset_index()
full.columns = ["element"] + list(full.columns[1:])
full["type"] = self.type()
full["ID_replicon"] = self.replicon.id
full["ID_integron"] = id(self) # uniq identifier of a given Integron
full["default"] = "Yes" if not self.cfg.local_max else "No"
try:
# when replicon has been got using utils.FastaIterator
full["considered_topology"] = self.replicon.topology
except AttributeError:
# if replicon is a bare Bio.SeqRecord
full["considered_topology"] = self.cfg.default_topology
full.drop_duplicates(subset=["element"], inplace=True)
return full
[docs] def draw_integron(self, file=None):
"""
Represent the different element of the integrons if file is provide
save the drawing on the file otherwise display it on screen.
:param str file: the path to save the integron schema (in pdf format)
"""
full = self.describe()
full["evalue"] = full["evalue"].astype("float")
h = [i + (0.5*i) if j == "Promoter" else i for i, j in zip(full.strand, full.type_elt)]
fig, ax = plt.subplots(1, 1, figsize=(16, 9))
alpha = [i if i < 1 else 1 for i in (
(np.log10(full.evalue) - np.ones(len(full)) * -1) /
(np.ones(len(full)) * -10 - np.ones(len(full)) * -1)
* (1 - 0.2) + 0.2).fillna(1).tolist()]
# normalize alpha value with 0.2 as min value
colors = ["#749FCD" if i == "attC" else
"#DD654B" if i == "intI" else
"#6BC865" if (i[-2:] == "_1" and j == "Promoter") else
"#D06CC0" if (i[-2:] == "_2" and j == "Promoter") else
"#C3B639" if (i[-2:] == "_3" and j == "Promoter") else
"#e8950e" if i != "protein" else
"#d3d3d3" for (i, j) in zip(full.annotation,
full.type_elt)]
# colors_alpha = [j+[i] for j, i in zip([[ord(c) / 255. for c in i[1:].decode("hex")] for i in colors],
# alpha)]
colors_alpha = [matplotlib.colors.to_rgba_array(c, a)[0].tolist() for c, a in zip(colors, alpha)]
# ec = ["red" if i =="attC" else
# "white" for i in full.type_elt]
# z_order = [100 if i == "attC" else 1 for i in full.type_elt]
z_order = 10
ax.barh(np.zeros(len(full)), full.pos_end-full.pos_beg,
height=h, left=full.pos_beg,
color=colors_alpha, zorder=z_order, ec=None) # edgecolor=ec,
xlims = ax.get_xlim()
for c, l in zip(["#749FCD", "#DD654B", "#6BC865", "#D06CC0", "#C3B639", "#e8950e", "#d3d3d3"],
["attC", "integrase", "Promoter/attI class 1",
"Promoter/attI class 2", "Promoter/attI class 3",
"Functional Annotation", "Hypothetical Protein"]):
ax.bar(0, 0, color=c, label=l)
plt.legend(loc=[1.01, 0.4])
ax.set_xlim(xlims)
fig.subplots_adjust(left=0.05, right=0.80)
ax.hlines(0, ax.get_xlim()[0], ax.get_xlim()[1], "lightgrey", "--")
ax.grid(True, "major", axis="x")
ax.set_ylim(-4, 4)
ax.get_yaxis().set_visible(False)
if file:
fig.savefig(file, format="pdf")
plt.close(fig)
else:
fig.show()
[docs] def has_integrase(self):
"""
:return: True if integron has integrase False otherwise.
"""
return not self.integrase.empty
[docs] def has_attC(self):
"""
:return: True if integron has attc sites False otherwise.
"""
return not self.attC.empty