Module hocort.aligners.biobloom
Expand source code
import logging
import os
import sys
import hocort.execute as exe
from hocort.parse.parser import ArgParser
from hocort.parse.parser import validate_args
# Module-level logger; it is named after this file's path (__file__).
logger = logging.getLogger(__file__)
class BioBloom():
    """
    BioBloom implementation of the Classifier abstract base class.

    build_index and classify only assemble argument lists for the
    `biobloommaker` and `biobloomcategorizer` executables; index_interface
    additionally parses command-line arguments and runs the index build.
    """

    def build_index(self, path_out, fasta_in, threads=1, options=None, **kwargs):
        """
        Builds an index.

        Parameters
        ----------
        path_out : string
            Path where the output index is written (passed to biobloommaker via -o).
        fasta_in : string
            Path where the input FASTA file is located.
        threads : int
            Number of threads to use.
        options : list
            An options list where additional arguments may be specified.
            They are appended to the end of the command. Defaults to no
            additional arguments.
        **kwargs
            Accepted for interface compatibility and ignored.

        Returns
        -------
        [cmd] : list
            List of commands to be executed.

        Raises
        ------
        ValueError
            Raised if no input FASTA file is given, or no output path is given.
            If disallowed characters are found in input.
        """
        # Copy so the caller's list (or a shared default) is never aliased.
        options = [] if options is None else list(options)

        # validate input; missing (None) paths are reported by the explicit
        # checks below instead of being handed to the validator
        given = [value for value in (path_out, fasta_in) if value is not None]
        valid, arg, chars = validate_args(given + options)
        if not valid:
            raise ValueError(f'Input with disallowed characters detected: "{arg}" - {chars}')
        if not fasta_in:
            raise ValueError('No input FASTA file was given.')
        # should probably check whether output folder exists already here
        if not path_out:
            raise ValueError('No output path was given.')

        cmd = ['biobloommaker', '-t', str(threads), '-p', 'reference', fasta_in, '-o', path_out]
        # Forward the additional arguments, as classify() does; previously
        # they were validated but silently dropped.
        cmd += options
        return [cmd]

    def classify(self, index, seq1, out, seq2=None, threads=1, options=None):
        """
        Matches sequences to a reference database and classifies them.

        Parameters
        ----------
        index : string
            Path to the index to classify against (passed to biobloomcategorizer via -f).
        seq1 : string
            Path where the first input FastQ file is located.
        out : string
            Path (path/prefix) where the output FastQ files will be written.
        seq2 : string
            Path where the second input FastQ file is located. When given,
            the command is built in paired mode.
        threads : int
            Number of threads to use.
        options : list
            An options list where additional arguments may be specified.
            They are appended to the end of the command. Defaults to no
            additional arguments.

        Returns
        -------
        [cmd] : list
            List of commands to be executed.

        Raises
        ------
        ValueError
            Raised if no input index path is given, or no input FastQ file is given.
            If disallowed characters are found in input.
        """
        # Copy so the caller's list (or a shared default) is never aliased.
        options = [] if options is None else list(options)

        # validate input; an absent seq2 (None) is not a string to validate
        given = [value for value in (index, seq1, out, seq2) if value is not None]
        valid, arg, chars = validate_args(given + options)
        if not valid:
            raise ValueError(f'Input with disallowed characters detected: "{arg}" - {chars}')
        if not index:
            raise ValueError('No index path was given.')
        if not seq1:
            raise ValueError('No input FastQ was given.')

        cmd = ['biobloomcategorizer', '-t', str(threads), '-f', index, '--fq', '-p', out]
        if seq2:
            cmd += ['--paired_mode', seq1, seq2]
        else:
            cmd += [seq1]
        cmd += options
        return [cmd]

    def index_interface(self, args):
        """
        Main function for the index generation interface. Parses arguments and generates the index.

        Exits the process with status 1 if the directory part of the output
        path does not exist.

        Parameters
        ----------
        args : list
            This list is parsed by ArgumentParser.

        Returns
        -------
        returncode
            First element of the value returned by `exe.execute` for the
            index-building command.
        """
        name = self.__class__.__name__
        parser = ArgParser(
            description=f'{name} aligner',
            usage=f'hocort index {name} [-h] [--threads <int>] -i <fasta> -o <index>'
        )
        parser.add_argument(
            '-i',
            '--input',
            required=True,
            type=str,
            metavar='<fasta>',
            help='str: path to sequence files (required)'
        )
        parser.add_argument(
            '-o',
            '--output',
            required=True,
            type=str,
            metavar='<index>',
            help='str: path to output index (dir/basename) (required)'
        )
        parser.add_argument(
            '-t',
            '--threads',
            required=False,
            type=int,
            metavar='<int>',
            # os.cpu_count() may return None; fall back to a single thread
            default=os.cpu_count() or 1,
            help='int: number of threads (default: max available on machine)'
        )
        parsed = parser.parse_args(args=args)

        ref = parsed.input
        out = parsed.output
        threads = parsed.threads

        # A bare basename has an empty directory part and os.path.isdir('')
        # is False, so treat it as the current directory.
        out_dir = os.path.dirname(out) or os.curdir
        if not os.path.isdir(out_dir):
            logger.error(f'Output path does not exist: {out}')
            sys.exit(1)

        cmd = self.build_index(out, ref, threads=threads)
        logger.warning(f'Generating index for: {self.__class__.__name__}')
        returncode = exe.execute(cmd, pipe=False, merge_stdout_stderr=True)
        return returncode[0]
Classes
class BioBloom
-
BioBloom implementation of the Classifier abstract base class.
Expand source code
class BioBloom():
    """
    BioBloom implementation of the Classifier abstract base class.

    build_index and classify only assemble argument lists for the
    `biobloommaker` and `biobloomcategorizer` executables; index_interface
    additionally parses command-line arguments and runs the index build.
    """

    def build_index(self, path_out, fasta_in, threads=1, options=None, **kwargs):
        """
        Builds an index.

        Parameters
        ----------
        path_out : string
            Path where the output index is written (passed to biobloommaker via -o).
        fasta_in : string
            Path where the input FASTA file is located.
        threads : int
            Number of threads to use.
        options : list
            An options list where additional arguments may be specified.
            They are appended to the end of the command. Defaults to no
            additional arguments.
        **kwargs
            Accepted for interface compatibility and ignored.

        Returns
        -------
        [cmd] : list
            List of commands to be executed.

        Raises
        ------
        ValueError
            Raised if no input FASTA file is given, or no output path is given.
            If disallowed characters are found in input.
        """
        # Copy so the caller's list (or a shared default) is never aliased.
        options = [] if options is None else list(options)

        # validate input; missing (None) paths are reported by the explicit
        # checks below instead of being handed to the validator
        given = [value for value in (path_out, fasta_in) if value is not None]
        valid, arg, chars = validate_args(given + options)
        if not valid:
            raise ValueError(f'Input with disallowed characters detected: "{arg}" - {chars}')
        if not fasta_in:
            raise ValueError('No input FASTA file was given.')
        # should probably check whether output folder exists already here
        if not path_out:
            raise ValueError('No output path was given.')

        cmd = ['biobloommaker', '-t', str(threads), '-p', 'reference', fasta_in, '-o', path_out]
        # Forward the additional arguments, as classify() does; previously
        # they were validated but silently dropped.
        cmd += options
        return [cmd]

    def classify(self, index, seq1, out, seq2=None, threads=1, options=None):
        """
        Matches sequences to a reference database and classifies them.

        Parameters
        ----------
        index : string
            Path to the index to classify against (passed to biobloomcategorizer via -f).
        seq1 : string
            Path where the first input FastQ file is located.
        out : string
            Path (path/prefix) where the output FastQ files will be written.
        seq2 : string
            Path where the second input FastQ file is located. When given,
            the command is built in paired mode.
        threads : int
            Number of threads to use.
        options : list
            An options list where additional arguments may be specified.
            They are appended to the end of the command. Defaults to no
            additional arguments.

        Returns
        -------
        [cmd] : list
            List of commands to be executed.

        Raises
        ------
        ValueError
            Raised if no input index path is given, or no input FastQ file is given.
            If disallowed characters are found in input.
        """
        # Copy so the caller's list (or a shared default) is never aliased.
        options = [] if options is None else list(options)

        # validate input; an absent seq2 (None) is not a string to validate
        given = [value for value in (index, seq1, out, seq2) if value is not None]
        valid, arg, chars = validate_args(given + options)
        if not valid:
            raise ValueError(f'Input with disallowed characters detected: "{arg}" - {chars}')
        if not index:
            raise ValueError('No index path was given.')
        if not seq1:
            raise ValueError('No input FastQ was given.')

        cmd = ['biobloomcategorizer', '-t', str(threads), '-f', index, '--fq', '-p', out]
        if seq2:
            cmd += ['--paired_mode', seq1, seq2]
        else:
            cmd += [seq1]
        cmd += options
        return [cmd]

    def index_interface(self, args):
        """
        Main function for the index generation interface. Parses arguments and generates the index.

        Exits the process with status 1 if the directory part of the output
        path does not exist.

        Parameters
        ----------
        args : list
            This list is parsed by ArgumentParser.

        Returns
        -------
        returncode
            First element of the value returned by `exe.execute` for the
            index-building command.
        """
        name = self.__class__.__name__
        parser = ArgParser(
            description=f'{name} aligner',
            usage=f'hocort index {name} [-h] [--threads <int>] -i <fasta> -o <index>'
        )
        parser.add_argument(
            '-i',
            '--input',
            required=True,
            type=str,
            metavar='<fasta>',
            help='str: path to sequence files (required)'
        )
        parser.add_argument(
            '-o',
            '--output',
            required=True,
            type=str,
            metavar='<index>',
            help='str: path to output index (dir/basename) (required)'
        )
        parser.add_argument(
            '-t',
            '--threads',
            required=False,
            type=int,
            metavar='<int>',
            # os.cpu_count() may return None; fall back to a single thread
            default=os.cpu_count() or 1,
            help='int: number of threads (default: max available on machine)'
        )
        parsed = parser.parse_args(args=args)

        ref = parsed.input
        out = parsed.output
        threads = parsed.threads

        # A bare basename has an empty directory part and os.path.isdir('')
        # is False, so treat it as the current directory.
        out_dir = os.path.dirname(out) or os.curdir
        if not os.path.isdir(out_dir):
            logger.error(f'Output path does not exist: {out}')
            sys.exit(1)

        cmd = self.build_index(out, ref, threads=threads)
        logger.warning(f'Generating index for: {self.__class__.__name__}')
        returncode = exe.execute(cmd, pipe=False, merge_stdout_stderr=True)
        return returncode[0]
Methods
def build_index(self, path_out, fasta_in, threads=1, options=[], **kwargs)
-
Builds an index.
Parameters
path_out
:string
- Path where the output index is written.
fasta_in
:string
- Path where the input FASTA file is located.
threads
:int
- Number of threads to use.
options
:list
- An options list where additional arguments may be specified.
Returns
[cmd] : list List of commands to be executed.
Raises
ValueError
- Raised if no input FASTA file is given, or no output path is given. If disallowed characters are found in input.
Expand source code
def build_index(self, path_out, fasta_in, threads=1, options=None, **kwargs):
    """
    Builds an index.

    Parameters
    ----------
    path_out : string
        Path where the output index is written (passed to biobloommaker via -o).
    fasta_in : string
        Path where the input FASTA file is located.
    threads : int
        Number of threads to use.
    options : list
        An options list where additional arguments may be specified.
        They are appended to the end of the command. Defaults to no
        additional arguments.
    **kwargs
        Accepted for interface compatibility and ignored.

    Returns
    -------
    [cmd] : list
        List of commands to be executed.

    Raises
    ------
    ValueError
        Raised if no input FASTA file is given, or no output path is given.
        If disallowed characters are found in input.
    """
    # Copy so the caller's list (or a shared default) is never aliased.
    options = [] if options is None else list(options)

    # validate input; missing (None) paths are reported by the explicit
    # checks below instead of being handed to the validator
    given = [value for value in (path_out, fasta_in) if value is not None]
    valid, arg, chars = validate_args(given + options)
    if not valid:
        raise ValueError(f'Input with disallowed characters detected: "{arg}" - {chars}')
    if not fasta_in:
        raise ValueError('No input FASTA file was given.')
    # should probably check whether output folder exists already here
    if not path_out:
        raise ValueError('No output path was given.')

    cmd = ['biobloommaker', '-t', str(threads), '-p', 'reference', fasta_in, '-o', path_out]
    # Forward the additional arguments, as classify() does; previously
    # they were validated but silently dropped.
    cmd += options
    return [cmd]
def classify(self, index, seq1, out, seq2=None, threads=1, options=[])
-
Matches sequences to a reference database and classifies them.
Parameters
index
:string
- Path where the index to classify against is located.
seq1
:string
- Path where the first input FastQ file is located.
out
:string
- Path (path/prefix) where the output FastQ files will be written.
seq2
:string
- Path where the second input FastQ file is located.
threads
:int
- Number of threads to use.
options
:list
- An options list where additional arguments may be specified.
Returns
[cmd] : list List of commands to be executed.
Raises
ValueError
- Raised if no input index path is given, or no input FastQ file is given. If disallowed characters are found in input.
Expand source code
def classify(self, index, seq1, out, seq2=None, threads=1, options=None):
    """
    Matches sequences to a reference database and classifies them.

    Parameters
    ----------
    index : string
        Path to the index to classify against (passed to biobloomcategorizer via -f).
    seq1 : string
        Path where the first input FastQ file is located.
    out : string
        Path (path/prefix) where the output FastQ files will be written.
    seq2 : string
        Path where the second input FastQ file is located. When given,
        the command is built in paired mode.
    threads : int
        Number of threads to use.
    options : list
        An options list where additional arguments may be specified.
        They are appended to the end of the command. Defaults to no
        additional arguments.

    Returns
    -------
    [cmd] : list
        List of commands to be executed.

    Raises
    ------
    ValueError
        Raised if no input index path is given, or no input FastQ file is given.
        If disallowed characters are found in input.
    """
    # Copy so the caller's list (or a shared default) is never aliased.
    options = [] if options is None else list(options)

    # validate input; an absent seq2 (None) is not a string to validate
    given = [value for value in (index, seq1, out, seq2) if value is not None]
    valid, arg, chars = validate_args(given + options)
    if not valid:
        raise ValueError(f'Input with disallowed characters detected: "{arg}" - {chars}')
    if not index:
        raise ValueError('No index path was given.')
    if not seq1:
        raise ValueError('No input FastQ was given.')

    cmd = ['biobloomcategorizer', '-t', str(threads), '-f', index, '--fq', '-p', out]
    if seq2:
        cmd += ['--paired_mode', seq1, seq2]
    else:
        cmd += [seq1]
    cmd += options
    return [cmd]
def index_interface(self, args)
-
Main function for the index generation interface. Parses arguments and generates the index.
Parameters
args
:list
- This list is parsed by ArgumentParser.
Returns
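The first element of the value returned by hocort.execute.execute for the index-building command (presumably its exit code).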
Expand source code
def index_interface(self, args):
    """
    Main function for the index generation interface. Parses arguments and generates the index.

    Exits the process with status 1 if the directory part of the output
    path does not exist.

    Parameters
    ----------
    args : list
        This list is parsed by ArgumentParser.

    Returns
    -------
    returncode
        First element of the value returned by `exe.execute` for the
        index-building command.
    """
    name = self.__class__.__name__
    parser = ArgParser(
        description=f'{name} aligner',
        usage=f'hocort index {name} [-h] [--threads <int>] -i <fasta> -o <index>'
    )
    parser.add_argument(
        '-i',
        '--input',
        required=True,
        type=str,
        metavar='<fasta>',
        help='str: path to sequence files (required)'
    )
    parser.add_argument(
        '-o',
        '--output',
        required=True,
        type=str,
        metavar='<index>',
        help='str: path to output index (dir/basename) (required)'
    )
    parser.add_argument(
        '-t',
        '--threads',
        required=False,
        type=int,
        metavar='<int>',
        # os.cpu_count() may return None; fall back to a single thread
        default=os.cpu_count() or 1,
        help='int: number of threads (default: max available on machine)'
    )
    parsed = parser.parse_args(args=args)

    ref = parsed.input
    out = parsed.output
    threads = parsed.threads

    # A bare basename has an empty directory part and os.path.isdir('')
    # is False, so treat it as the current directory.
    out_dir = os.path.dirname(out) or os.curdir
    if not os.path.isdir(out_dir):
        logger.error(f'Output path does not exist: {out}')
        sys.exit(1)

    cmd = self.build_index(out, ref, threads=threads)
    logger.warning(f'Generating index for: {self.__class__.__name__}')
    returncode = exe.execute(cmd, pipe=False, merge_stdout_stderr=True)
    return returncode[0]