Source code for integron_finder.annotation

# -*- coding: utf-8 -*-

####################################################################################
# Integron_Finder - Integron Finder aims at detecting integrons in DNA sequences   #
# by finding particular features of the integron:                                  #
#   - the attC sites                                                               #
#   - the integrase                                                                #
#   - and when possible attI site and promoters.                                   #
#                                                                                  #
# Authors: Jean Cury, Bertrand Neron, Eduardo PC Rocha                             #
# Copyright (c) 2015 - 2018  Institut Pasteur, Paris and CNRS.                     #
# See the COPYRIGHT file for details                                               #
#                                                                                  #
# integron_finder is free software: you can redistribute it and/or modify          #
# it under the terms of the GNU General Public License as published by             #
# the Free Software Foundation, either version 3 of the License, or                #
# (at your option) any later version.                                              #
#                                                                                  #
# integron_finder is distributed in the hope that it will be useful,               #
# but WITHOUT ANY WARRANTY; without even the implied warranty of                   #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                    #
# GNU General Public License for more details.                                     #
#                                                                                  #
# You should have received a copy of the GNU General Public License                #
# along with this program (COPYING file).                                          #
# If not, see <http://www.gnu.org/licenses/>.                                      #
####################################################################################

import os
from subprocess import call
import colorlog

import numpy as np
import pandas as pd
from Bio import SeqFeature
from Bio import SeqIO

from .utils import get_name_from_path
from .hmm import read_hmm

_log = colorlog.getLogger(__name__)


[docs]def func_annot(integrons, replicon, prot_db, hmm_files, cfg, out_dir='.', evalue=10, coverage=0.5):
    """
    | Call hmmmer to annotate CDS associated with the integron.
    | Use Resfams per default (Gibson et al, ISME J.,  2014)

    :param integrons: integrons list to annotate
    :type integrons: list of :class:`integron_finder.integron.Integron` objects.
    :param replicon: replicon where the integrons were found (genomic fasta file)
    :type replicon: :class:`Bio.Seq.SeqRecord` object
    :param prot_db: the protein database corresponding to the replicon translation
    :type prot_db: :class:`integron.prot_db.ProteinDB` object.
    :param hmm_files: List of path of hmm profiles to use to scan the prot_file
    :type hmm_files: List[str]
    :param cfg: the configuration for this analyse
    :type cfg: :class:`integron_finder.config.Config`
    :param str out_dir: the path of the directory where to store the results
    :param float evalue:
    :param float coverage:
    :return: None.

             But several files per hmm file are produced.

             * subseqprot.tmp: fasta file containing a subset of protfile (the proteins belonging to the integron)
             * <hmm>_fa.res: an output of the hmm search.
             * <hmm>_fa_table.res: an output of the hmm search in tabulated format.

    """

    prot_tmp = os.path.join(out_dir, replicon.id + "_subseqprot.tmp")

    for integron in integrons:
        if os.path.isfile(prot_tmp):
            os.remove(prot_tmp)

        if integron.type() != "In0" and not integron.proteins.empty:

            func_annotate_res = pd.DataFrame(columns=["Accession_number",
                                                      "query_name", "ID_query",
                                                      "ID_prot", "strand",
                                                      "pos_beg", "pos_end", "evalue"])

            prot_to_annotate = []
            for prot_nb, prot_id in enumerate(prot_db, 1):
                if prot_id in integron.proteins.index:
                    prot_to_annotate.append(prot_db[prot_id])

            SeqIO.write(prot_to_annotate, prot_tmp, "fasta")
            for hmm in hmm_files:
                name_wo_ext = "{}_{}".format(replicon.id, get_name_from_path(hmm))
                hmm_out = os.path.join(out_dir, "{}_fa.res".format(name_wo_ext))
                hmm_tableout = os.path.join(out_dir, "{}_fa_table.res".format(name_wo_ext))
                hmm_cmd = [cfg.hmmsearch,
                           "-Z", str(prot_nb),
                           "--cpu", str(cfg.cpu),
                           "--tblout", hmm_tableout,
                           "-o", hmm_out,
                           hmm,
                           prot_tmp]

                try:
                    _log.debug("run hmmsearch: {}".format(' '.join(hmm_cmd)))
                    returncode = call(hmm_cmd)
                except Exception as err:
                    raise RuntimeError("{0} failed : {1}".format(' '.join(hmm_cmd), err))
                if returncode != 0:
                    raise RuntimeError("{0} failed return code = {1}".format(' '.join(hmm_cmd), returncode))
                hmm_in = read_hmm(replicon.id, prot_db, hmm_out, cfg, evalue=evalue, coverage=coverage
                                  ).sort_values("evalue").drop_duplicates(subset="ID_prot")
                func_annotate_res = pd.concat([func_annotate_res, hmm_in])
            func_annotate_res = func_annotate_res.sort_values("evalue").drop_duplicates(subset="ID_prot")

            integron.proteins.loc[func_annotate_res.ID_prot, "evalue"] = func_annotate_res.evalue.values
            integron.proteins.loc[func_annotate_res.ID_prot, "annotation"] = func_annotate_res.query_name.values
            integron.proteins.loc[func_annotate_res.ID_prot, "model"] = func_annotate_res.ID_query.values
            integron.proteins = integron.proteins.astype(dtype=integron.dtype)


[docs]def add_feature(replicon, integron_desc, prot_db, dist_threshold):
    """
    Add integron annotation to the replicon.

    :param replicon: The Replicon to annotate
    :type replicon: a :class:`Bio.Seq.SeqRecord` object.
    :param integron_desc: integron description
    :type integron_desc: a :class:`pandas.DataFrame`
    :param prot_db: the path to the fasta file containing the translation of the replicon.
    :type prot_db: a :class:`integron_finder.prot_db.ProteinDB` object.
    :param int dist_threshold: Two elements are aggregated if they are distant of dist_threshold or less.
    """
    integron_desc = integron_desc.set_index("ID_integron").copy()
    for i in integron_desc.index.unique():

        if isinstance(integron_desc.loc[i], pd.Series):
            type_integron = integron_desc.loc[i].type
            start_integron = integron_desc.loc[i].pos_beg
            end_integron = integron_desc.loc[i].pos_end
            tmp = SeqFeature.SeqFeature(location=SeqFeature.FeatureLocation(int(start_integron) - 1, int(end_integron)),
                                        strand=0,
                                        type="integron",
                                        qualifiers={"integron_id": i, "integron_type": type_integron}
                                        )
            replicon.features.append(tmp)
            if integron_desc.loc[i].type_elt == "protein":

                tmp = SeqFeature.SeqFeature(location=SeqFeature.FeatureLocation(int(integron_desc.loc[i].pos_beg) - 1,
                                                                                int(integron_desc.loc[i].pos_end)),
                                            strand=integron_desc.loc[i].strand,
                                            type="CDS" if integron_desc.loc[i].annotation != "intI" else "integrase",
                                            qualifiers={"protein_id": integron_desc.loc[i].element,
                                                        "gene": integron_desc.loc[i].annotation,
                                                        "model": integron_desc.loc[i].model}
                                            )
                tmp.qualifiers["translation"] = [prot_db[prt_id] for prt_id in prot_db
                                                 if prt_id == integron_desc.loc[i].element][0].seq
                replicon.features.append(tmp)

            else:
                tmp = SeqFeature.SeqFeature(location=SeqFeature.FeatureLocation(int(integron_desc.loc[i].pos_beg) - 1,
                                                                                int(integron_desc.loc[i].pos_end)),
                                            strand=integron_desc.loc[i].strand,
                                            type=integron_desc.loc[i].type_elt,
                                            qualifiers={integron_desc.loc[i].type_elt: integron_desc.loc[i].element,
                                                        "model": integron_desc.loc[i].model}
                                            )

                replicon.features.append(tmp)

        else:
            type_integron = integron_desc.loc[i].type.values[0]
            # Should only be true if integron over edge of replicon:
            diff = integron_desc.loc[i].pos_beg.diff() > dist_threshold

            if diff.any():
                pos = np.where(diff)[0][0]
                start_integron_1 = int(integron_desc.loc[i].pos_beg.values[pos])
                end_integron_1 = len(replicon)
                start_integron_2 = 1
                end_integron_2 = int(integron_desc.loc[i].pos_end.values[pos-1])

                f1 = SeqFeature.FeatureLocation(start_integron_1 - 1, end_integron_1)
                f2 = SeqFeature.FeatureLocation(start_integron_2 - 1, end_integron_2)
                tmp = SeqFeature.SeqFeature(location=f1 + f2,
                                            strand=0,
                                            type="integron",
                                            qualifiers={"integron_id": i, "integron_type": type_integron}
                                            )
            else:
                start_integron = int(integron_desc.loc[i].pos_beg.values[0])
                end_integron = int(integron_desc.loc[i].pos_end.values[-1])

                tmp = SeqFeature.SeqFeature(location=SeqFeature.FeatureLocation(start_integron - 1, end_integron),
                                            strand=0,
                                            type="integron",
                                            qualifiers={"integron_id": i, "integron_type": type_integron}
                                            )
            replicon.features.append(tmp)
            for r in integron_desc.loc[i].iterrows():
                if r[1].type_elt == "protein":
                    tmp = SeqFeature.SeqFeature(location=SeqFeature.FeatureLocation(int(r[1].pos_beg) - 1,
                                                                                    int(r[1].pos_end)),
                                                strand=r[1].strand,
                                                type="CDS" if r[1].annotation != "intI" else "integrase",
                                                qualifiers={"protein_id": r[1].element,
                                                            "gene": r[1].annotation,
                                                            "model": r[1].model}
                                                )
                    tmp.qualifiers["translation"] = [prot_db[prt_id] for prt_id in prot_db
                                                     if prt_id == r[1].element][0].seq
                    replicon.features.append(tmp)
                else:
                    tmp = SeqFeature.SeqFeature(location=SeqFeature.FeatureLocation(int(r[1].pos_beg) - 1,
                                                                                    int(r[1].pos_end)),
                                                strand=r[1].strand,
                                                type=r[1].type_elt,
                                                qualifiers={r[1].type_elt: r[1].element, "model": r[1].model}
                                                )

                    replicon.features.append(tmp)

    # We get a ValueError otherwise, eg:
    # ValueError: Locus identifier 'gi|00000000|gb|XX123456.2|' is too long
    if len(replicon.name) > 16:
        replicon.name = replicon.name[-16:]