% Conference paper (Sinha and Clarke) in the BIBM 2017 proceedings.
% The funding acknowledgement and copyright line live in the non-standard
% fields `funding` and `copyright`, which bibliography styles ignore, so they
% are kept as metadata without being printed in the rendered reference.
% The conference dates are recorded in `eventdate` (ISO 8601 range).
@inproceedings{215346e3988340198f30fa7d72c35342,
  author    = {Sinha, Rohita and Clarke, Jennifer},
  title     = {When technology meets technology: Retrained `{Inception V3}' classifier for {NGS} based pathogen detection},
  booktitle = {Proceedings - 2017 {IEEE} International Conference on Bioinformatics and Biomedicine, {BIBM} 2017},
  editor    = {Yoo, Illhoi and Zheng, Jane Huiru and Gong, Yang and Hu, Xiaohua Tony and Shyu, Chi-Ren and Bromberg, Yana and Gao, Jean and Korkin, Dmitry},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  year      = {2017},
  month     = dec,
  day       = {15},
  pages     = {1--5},
  doi       = {10.1109/BIBM.2017.8217942},
  eventdate = {2017-11-13/2017-11-16},
  language  = {English (US)},
  keywords  = {Deep learning, NGS, Pathogen detection, TensorFlow},
  abstract  = {Accurate characterization of pathogenic microbes that may be present in food or clinical samples is essential in the design of appropriate intervention strategies. Inherent genomic patterns (codon-biases and rate of evolution) do simplify the classification of microbes at most taxonomic levels (genus and above), but mostly blur classification at Species/Strain levels. Hence, their classification at these finer taxonomic levels requires high-resolution genomic-data that provide SNP (Single Nucleotide Polymorphism) level precision. Existing classification methods involve either targeted amplification of sero-specific genes (serotyping and MLST) or sequencing of the entire microbial genome, both of which require extra time and resources. We present a computational approach, which harnesses the power of the metagenomic NGS-data and object-detection abilities of Convolutional Neural Networks (CNN)(Inception V3), for precise classification of pathogens by converting genomic-data (NGS-reads) into images (nucleotide-by-color). A small scale retraining (<50 images/class) of 'Inception V3' resulted in a classifier with 100\% and 96\% validation and test accuracies, respectively, when classifying pathogens such as Campylobacter coli/jejuni and Escherichia coli (O157:H7 and Non O157-STECs). We aim to extend this protocol to the detection of several microbes (multiple-objects) in a metagenomic image (genomic image of an entire microbial community).},
  funding   = {One of the important steps of our protocol was the detection of shared genomic regions (homologous regions) across all the members of a microbial clade (species or strain). A small fraction of these regions were later used to generate the genomic images for training and testing the classifiers. However, detection of shared genomic regions can be computationally exhaustive, since the number of DNA alignment operations increases with an increase in the number of genomes within a clade. To account for such limitations we have divided the alignment operations into multiple independent tasks and performed these steps on a high-throughput distributed computing platform (Open Science Grid, OSG). OSG is supported by National Science Foundation and Department of Energy and has emerged as one of the essential computational resources for the scientific community.},
  copyright = {{\textcopyright} 2017 IEEE},
}