diff --git a/.Rbuildignore b/.Rbuildignore new file mode 100644 index 0000000..91114bf --- /dev/null +++ b/.Rbuildignore @@ -0,0 +1,2 @@ +^.*\.Rproj$ +^\.Rproj\.user$ diff --git a/.Rproj.user/39CB7C5D/cpp-definition-cache b/.Rproj.user/39CB7C5D/cpp-definition-cache new file mode 100644 index 0000000..0637a08 --- /dev/null +++ b/.Rproj.user/39CB7C5D/cpp-definition-cache @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/.Rproj.user/39CB7C5D/pcs/files-pane.pper b/.Rproj.user/39CB7C5D/pcs/files-pane.pper new file mode 100644 index 0000000..4f335ba --- /dev/null +++ b/.Rproj.user/39CB7C5D/pcs/files-pane.pper @@ -0,0 +1,9 @@ +{ + "sortOrder": [ + { + "columnIndex": 2, + "ascending": true + } + ], + "path": "~/Github/seqsender/docs/articles" +} \ No newline at end of file diff --git a/.Rproj.user/39CB7C5D/pcs/source-pane.pper b/.Rproj.user/39CB7C5D/pcs/source-pane.pper new file mode 100644 index 0000000..6a747e8 --- /dev/null +++ b/.Rproj.user/39CB7C5D/pcs/source-pane.pper @@ -0,0 +1,4 @@ +{ + "activeTab": 1, + "activeTabSourceWindow0": 2 +} \ No newline at end of file diff --git a/.Rproj.user/39CB7C5D/pcs/windowlayoutstate.pper b/.Rproj.user/39CB7C5D/pcs/windowlayoutstate.pper new file mode 100644 index 0000000..bc98681 --- /dev/null +++ b/.Rproj.user/39CB7C5D/pcs/windowlayoutstate.pper @@ -0,0 +1,14 @@ +{ + "left": { + "splitterpos": 315, + "topwindowstate": "NORMAL", + "panelheight": 892, + "windowheight": 966 + }, + "right": { + "splitterpos": 580, + "topwindowstate": "NORMAL", + "panelheight": 892, + "windowheight": 966 + } +} \ No newline at end of file diff --git a/.Rproj.user/39CB7C5D/pcs/workbench-pane.pper b/.Rproj.user/39CB7C5D/pcs/workbench-pane.pper new file mode 100644 index 0000000..75e70e9 --- /dev/null +++ b/.Rproj.user/39CB7C5D/pcs/workbench-pane.pper @@ -0,0 +1,5 @@ +{ + "TabSet1": 0, + "TabSet2": 0, + "TabZoom": {} +} \ No newline at end of file diff --git a/.Rproj.user/39CB7C5D/persistent-state b/.Rproj.user/39CB7C5D/persistent-state new file 
mode 100644 index 0000000..8dfdf4d --- /dev/null +++ b/.Rproj.user/39CB7C5D/persistent-state @@ -0,0 +1,11 @@ +activeClientUrl="http://localhost:8787/" +build-last-errors="[]" +build-last-errors-base-dir="" +build-last-outputs="[]" +compile_pdf_state="{\"tab_visible\":false,\"running\":false,\"target_file\":\"\",\"output\":\"\",\"errors\":[]}" +displayName="snu3" +files.monitored-path="" +find-in-files-state="{\"handle\":\"\",\"input\":\"\",\"path\":\"\",\"regex\":false,\"ignoreCase\":false,\"results\":{\"file\":[],\"line\":[],\"lineValue\":[],\"matchOn\":[],\"matchOff\":[],\"replaceMatchOn\":[],\"replaceMatchOff\":[]},\"running\":false,\"replace\":false,\"preview\":false,\"gitFlag\":false,\"replacePattern\":\"\"}" +imageDirtyState="1" +portToken="a60fe80fe2a7" +saveActionState="-1" diff --git a/.Rproj.user/39CB7C5D/saved_source_markers b/.Rproj.user/39CB7C5D/saved_source_markers new file mode 100644 index 0000000..2b1bef1 --- /dev/null +++ b/.Rproj.user/39CB7C5D/saved_source_markers @@ -0,0 +1 @@ +{"active_set":"","sets":[]} \ No newline at end of file diff --git a/.Rproj.user/39CB7C5D/sources/prop/0476E19C b/.Rproj.user/39CB7C5D/sources/prop/0476E19C new file mode 100644 index 0000000..52d4fa5 --- /dev/null +++ b/.Rproj.user/39CB7C5D/sources/prop/0476E19C @@ -0,0 +1,6 @@ +{ + "source_window_id": "", + "Source": "Source", + "cursorPosition": "12,0", + "scrollLine": "0" +} \ No newline at end of file diff --git a/.Rproj.user/39CB7C5D/sources/prop/11FFA47D b/.Rproj.user/39CB7C5D/sources/prop/11FFA47D new file mode 100644 index 0000000..68492b8 --- /dev/null +++ b/.Rproj.user/39CB7C5D/sources/prop/11FFA47D @@ -0,0 +1,6 @@ +{ + "source_window_id": "", + "Source": "Source", + "cursorPosition": "18,0", + "scrollLine": "0" +} \ No newline at end of file diff --git a/.Rproj.user/39CB7C5D/sources/prop/133055F3 b/.Rproj.user/39CB7C5D/sources/prop/133055F3 new file mode 100644 index 0000000..3d549f2 --- /dev/null +++ b/.Rproj.user/39CB7C5D/sources/prop/133055F3 @@ -0,0 +1,6 
@@ +{ + "source_window_id": "", + "Source": "Source", + "cursorPosition": "18,3", + "scrollLine": "2" +} \ No newline at end of file diff --git a/.Rproj.user/39CB7C5D/sources/prop/145F7FB1 b/.Rproj.user/39CB7C5D/sources/prop/145F7FB1 new file mode 100644 index 0000000..b8bf8c6 --- /dev/null +++ b/.Rproj.user/39CB7C5D/sources/prop/145F7FB1 @@ -0,0 +1,6 @@ +{ + "source_window_id": "", + "Source": "Source", + "cursorPosition": "18,3", + "scrollLine": "0" +} \ No newline at end of file diff --git a/.Rproj.user/39CB7C5D/sources/prop/23CF97DB b/.Rproj.user/39CB7C5D/sources/prop/23CF97DB new file mode 100644 index 0000000..b384e82 --- /dev/null +++ b/.Rproj.user/39CB7C5D/sources/prop/23CF97DB @@ -0,0 +1,6 @@ +{ + "source_window_id": "", + "Source": "Source", + "cursorPosition": "11,0", + "scrollLine": "4" +} \ No newline at end of file diff --git a/.Rproj.user/39CB7C5D/sources/prop/31DDB8C3 b/.Rproj.user/39CB7C5D/sources/prop/31DDB8C3 new file mode 100644 index 0000000..b8bf8c6 --- /dev/null +++ b/.Rproj.user/39CB7C5D/sources/prop/31DDB8C3 @@ -0,0 +1,6 @@ +{ + "source_window_id": "", + "Source": "Source", + "cursorPosition": "18,3", + "scrollLine": "0" +} \ No newline at end of file diff --git a/.Rproj.user/39CB7C5D/sources/prop/852CBDEC b/.Rproj.user/39CB7C5D/sources/prop/852CBDEC new file mode 100644 index 0000000..79c8b4d --- /dev/null +++ b/.Rproj.user/39CB7C5D/sources/prop/852CBDEC @@ -0,0 +1,6 @@ +{ + "source_window_id": "", + "Source": "Source", + "cursorPosition": "133,7", + "scrollLine": "128" +} \ No newline at end of file diff --git a/.Rproj.user/39CB7C5D/sources/prop/8D5998D2 b/.Rproj.user/39CB7C5D/sources/prop/8D5998D2 new file mode 100644 index 0000000..f495ea4 --- /dev/null +++ b/.Rproj.user/39CB7C5D/sources/prop/8D5998D2 @@ -0,0 +1,6 @@ +{ + "source_window_id": "", + "Source": "Source", + "cursorPosition": "41,0", + "scrollLine": "11" +} \ No newline at end of file diff --git a/.Rproj.user/39CB7C5D/sources/prop/8D7E8C0C 
b/.Rproj.user/39CB7C5D/sources/prop/8D7E8C0C new file mode 100644 index 0000000..b8bf8c6 --- /dev/null +++ b/.Rproj.user/39CB7C5D/sources/prop/8D7E8C0C @@ -0,0 +1,6 @@ +{ + "source_window_id": "", + "Source": "Source", + "cursorPosition": "18,3", + "scrollLine": "0" +} \ No newline at end of file diff --git a/.Rproj.user/39CB7C5D/sources/prop/96F7E2CE b/.Rproj.user/39CB7C5D/sources/prop/96F7E2CE new file mode 100644 index 0000000..5a77caf --- /dev/null +++ b/.Rproj.user/39CB7C5D/sources/prop/96F7E2CE @@ -0,0 +1,6 @@ +{ + "source_window_id": "", + "Source": "Source", + "cursorPosition": "11,1", + "scrollLine": "0" +} \ No newline at end of file diff --git a/.Rproj.user/39CB7C5D/sources/prop/A381AFDE b/.Rproj.user/39CB7C5D/sources/prop/A381AFDE new file mode 100644 index 0000000..fba54c1 --- /dev/null +++ b/.Rproj.user/39CB7C5D/sources/prop/A381AFDE @@ -0,0 +1,6 @@ +{ + "source_window_id": "", + "Source": "Source", + "cursorPosition": "135,7", + "scrollLine": "118" +} \ No newline at end of file diff --git a/.Rproj.user/39CB7C5D/sources/prop/CA2A269A b/.Rproj.user/39CB7C5D/sources/prop/CA2A269A new file mode 100644 index 0000000..bb27690 --- /dev/null +++ b/.Rproj.user/39CB7C5D/sources/prop/CA2A269A @@ -0,0 +1,4 @@ +{ + "source_window_id": "", + "Source": "Source" +} \ No newline at end of file diff --git a/.Rproj.user/39CB7C5D/sources/prop/CB36CFA0 b/.Rproj.user/39CB7C5D/sources/prop/CB36CFA0 new file mode 100644 index 0000000..c697bc1 --- /dev/null +++ b/.Rproj.user/39CB7C5D/sources/prop/CB36CFA0 @@ -0,0 +1,6 @@ +{ + "source_window_id": "", + "Source": "Source", + "cursorPosition": "130,7", + "scrollLine": "117" +} \ No newline at end of file diff --git a/.Rproj.user/39CB7C5D/sources/prop/ED898A07 b/.Rproj.user/39CB7C5D/sources/prop/ED898A07 new file mode 100644 index 0000000..965320d --- /dev/null +++ b/.Rproj.user/39CB7C5D/sources/prop/ED898A07 @@ -0,0 +1,6 @@ +{ + "source_window_id": "", + "Source": "Source", + "cursorPosition": "159,7", + "scrollLine": "148" +} 
\ No newline at end of file diff --git a/.Rproj.user/39CB7C5D/sources/prop/FB37969B b/.Rproj.user/39CB7C5D/sources/prop/FB37969B new file mode 100644 index 0000000..360375b --- /dev/null +++ b/.Rproj.user/39CB7C5D/sources/prop/FB37969B @@ -0,0 +1,6 @@ +{ + "source_window_id": "", + "Source": "Source", + "cursorPosition": "4,0", + "scrollLine": "0" +} \ No newline at end of file diff --git a/.Rproj.user/39CB7C5D/sources/prop/INDEX b/.Rproj.user/39CB7C5D/sources/prop/INDEX new file mode 100644 index 0000000..cd1b937 --- /dev/null +++ b/.Rproj.user/39CB7C5D/sources/prop/INDEX @@ -0,0 +1,21 @@ +~%2FGithub%2Fseqsender%2F.gitignore="11FFA47D" +~%2FGithub%2Fseqsender%2FDESCRIPTION="8D5998D2" +~%2FGithub%2Fseqsender%2FLICENSE="CA2A269A" +~%2FGithub%2Fseqsender%2FNEWS.md="FB37969B" +~%2FGithub%2Fseqsender%2FREADME.Rmd="707D8EA7" +~%2FGithub%2Fseqsender%2F_pkgdown.yml="0476E19C" +~%2FGithub%2Fseqsender%2Fconfig%2Fmain_config.yaml="67478E68" +~%2FGithub%2Fseqsender%2Finst%2FCITATION="96F7E2CE" +~%2FGithub%2Fseqsender%2Fvignettes%2Fbiosample_submission.Rmd="133055F3" +~%2FGithub%2Fseqsender%2Fvignettes%2Fcompose_installation.Rmd="852CBDEC" +~%2FGithub%2Fseqsender%2Fvignettes%2Fdocker_installation.Rmd="CB36CFA0" +~%2FGithub%2Fseqsender%2Fvignettes%2Ffaqs.Rmd="DE820693" +~%2FGithub%2Fseqsender%2Fvignettes%2Fgenbank_submission.Rmd="8D7E8C0C" +~%2FGithub%2Fseqsender%2Fvignettes%2Fgisaid_cov_submission.Rmd="145F7FB1" +~%2FGithub%2Fseqsender%2Fvignettes%2Fgisaid_flu_submission.Rmd="23CF97DB" +~%2FGithub%2Fseqsender%2Fvignettes%2Fgisaid_options.Rmd="ABEE9B5E" +~%2FGithub%2Fseqsender%2Fvignettes%2Flocal_installation.Rmd="ED898A07" +~%2FGithub%2Fseqsender%2Fvignettes%2Fprerequisites.Rmd="3EA705BD" +~%2FGithub%2Fseqsender%2Fvignettes%2Fsingularity_installation.Rmd="A381AFDE" +~%2FGithub%2Fseqsender%2Fvignettes%2Fsra_options.Rmd="D3D73A1D" +~%2FGithub%2Fseqsender%2Fvignettes%2Fsra_submission.Rmd="31DDB8C3" diff --git a/.Rproj.user/39CB7C5D/sources/session-644ed55e/042D06D7 
b/.Rproj.user/39CB7C5D/sources/session-644ed55e/042D06D7 new file mode 100644 index 0000000..b96f528 --- /dev/null +++ b/.Rproj.user/39CB7C5D/sources/session-644ed55e/042D06D7 @@ -0,0 +1,26 @@ +{ + "id": "042D06D7", + "path": "~/Github/seqsender/NEWS.md", + "project_path": "NEWS.md", + "type": "markdown", + "hash": "427892802", + "contents": "", + "dirty": false, + "created": 1707162056710.0, + "source_on_save": false, + "relative_order": 12, + "properties": { + "source_window_id": "", + "Source": "Source", + "cursorPosition": "4,0", + "scrollLine": "0" + }, + "folds": "", + "lastKnownWriteTime": 1707499282, + "encoding": "UTF-8", + "collab_server": "", + "source_window": "", + "last_content_update": 1707499282031, + "read_only": false, + "read_only_alternatives": [] +} \ No newline at end of file diff --git a/.Rproj.user/39CB7C5D/sources/session-644ed55e/042D06D7-contents b/.Rproj.user/39CB7C5D/sources/session-644ed55e/042D06D7-contents new file mode 100644 index 0000000..b96c15c --- /dev/null +++ b/.Rproj.user/39CB7C5D/sources/session-644ed55e/042D06D7-contents @@ -0,0 +1,4 @@ + +# seqsender 1.1.0 +* Github Repo: https://github.com/CDCgov/seqsender +* Documentation: https://cdcgov.github.io/seqsender diff --git a/.Rproj.user/39CB7C5D/sources/session-644ed55e/0CBC917E b/.Rproj.user/39CB7C5D/sources/session-644ed55e/0CBC917E new file mode 100644 index 0000000..e69a3b4 --- /dev/null +++ b/.Rproj.user/39CB7C5D/sources/session-644ed55e/0CBC917E @@ -0,0 +1,26 @@ +{ + "id": "0CBC917E", + "path": "~/Github/seqsender/vignettes/compose_installation.Rmd", + "project_path": "vignettes/compose_installation.Rmd", + "type": "r_markdown", + "hash": "1875304587", + "contents": "", + "dirty": false, + "created": 1707157109242.0, + "source_on_save": false, + "relative_order": 9, + "properties": { + "source_window_id": "", + "Source": "Source", + "cursorPosition": "133,7", + "scrollLine": "128" + }, + "folds": "", + "lastKnownWriteTime": 1707512280, + "encoding": "UTF-8", + 
"collab_server": "", + "source_window": "", + "last_content_update": 1707512280458, + "read_only": false, + "read_only_alternatives": [] +} \ No newline at end of file diff --git a/.Rproj.user/39CB7C5D/sources/session-644ed55e/0CBC917E-contents b/.Rproj.user/39CB7C5D/sources/session-644ed55e/0CBC917E-contents new file mode 100644 index 0000000..5a0a7c7 --- /dev/null +++ b/.Rproj.user/39CB7C5D/sources/session-644ed55e/0CBC917E-contents @@ -0,0 +1,535 @@ +--- +output: rmarkdown::html_document +title: "How to run seqsender with Compose" +vignette: > + %\VignetteIndexEntry{How to run seqsender with Compose} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include=FALSE, echo=FALSE, message=FALSE, warning=FALSE} +# R libraries +library(yaml) # for yaml file + +# Read in the DESCRIPTION file +description <- yaml::read_yaml("../DESCRIPTION") + +# Define variables +program <- description$Package + +# Define github repo +github_repo <- description$URL + +# Define github pages URL +github_pages_url <- description$GITHUB_PAGES +``` + + + +**SOFTWARE REQUIREMENTS:** + +- Linux (64-bit) or Mac OS X (64-bit) +- Git version 2.25.1 or later +- Docker version 20.10.14 or later +- Docker Compose version 2.21 or later +- Standard utilities: curl, tar, unzip + +**ADDITIONAL REQUIREMENTS:** + +See [PRE-REQUISITES](`r github_pages_url`/index.html#prerequisites) and [REQUIREMENT FILES](`r github_pages_url`/index.html#requirement-files) before proceeding to the next steps + +### (1) Clone ``r program`` repo to your $HOME directory + +``` bash +cd $HOME +git clone `r github_repo`.git +``` + +### (2) Navigate to ``r program`` folder where `docker-compose.yml` is stored and edit that file to link the data inputs to run ``r program`` + +``` bash +cd `r program` +``` + +Here is a quick look of the `docker-compose.yaml` file: + +```bash +version: "3.9" + +x-data-volumes: + &data-volume + type: bind + source: $HOME + target: /data + +services: + seqsender: + 
container_name: seqsender + image: cdcgov/seqsender-dev:latest + restart: always + volumes: + - *data-volume + command: tail -f /dev/null +``` + +_**NOTE:** `source` is the storage location of your local machine. This location will be mapped to `/data` directory inside the container. Here we are mounting the local `$HOME` directory to `/data` inside the container._ + +### (3) Start up the ``r program`` container + +```bash +docker-compose up -d +``` + +**`-d`**: run the container in detached mode
+ +For more information about the docker-compose syntax, see docker-compose up reference + + +### (4) Check if the container is running + +``` bash +docker container ps + + +CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES +b37b6b19c4e8 `r program`:latest "/bin/bash" 5 hours ago Up 5 hours `r program` +``` + +### (5) See a list of commands in ``r program`` container + +``` bash +docker exec -it `r program` bash `r program`-kickoff --help +``` + +**`-t`**: allocate a pseudo-tty
+**`-i`**: keep STDIN open even if not attached
+**`-h`**, **`--help`**: show help messages and exit + +``` bash +usage: `r program`.py [-h] + {prep,submit,check_submission_status,template,version} ... + +Automate the process of batch uploading consensus sequences and metadata to +databases of your choices + +positional arguments: + {prep,submit,check_submission_status,template,version} + +optional arguments: + -h, --help show this help message and exit +``` + +Rather than hastily jump in and submit a `production` submission right away, we can utilize GISAID's and NCBI's **“TEST-SERVER”** to upload a `test` submission first. That way submitter can familiarize themselves with the submission process prior to make a real submission. + +**Note:** Duplicate test submissions will result in an error. Please create new sequence names each time you plan to run test submissions to avoid this issue. + +### Submit a `test` submission with a pre-processed dataset + + + +### Submit a `test` submission with your own dataset + + + +


+ +Any questions or issues? Please report them on our Github issue tracker. + +
+ + + + + diff --git a/.Rproj.user/39CB7C5D/sources/session-644ed55e/11A85592-contents b/.Rproj.user/39CB7C5D/sources/session-644ed55e/11A85592-contents new file mode 100644 index 0000000..e5c780e --- /dev/null +++ b/.Rproj.user/39CB7C5D/sources/session-644ed55e/11A85592-contents @@ -0,0 +1,163 @@ +--- +title: "NCBI - SRA" +output: rmarkdown::html_document +vignette: > + %\VignetteIndexEntry{NCBI - SRA} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + + + + +```{r, include=FALSE, echo=FALSE, message=FALSE, warning=FALSE} +# R libraries +library(knitr) # for html table +library(yaml) # for yaml file +library(tidyverse) # for pipe +library(reshape2) # for data manipulation + +# Read in the DESCRIPTION file +description <- yaml::read_yaml("../DESCRIPTION") + +# Define variables +program <- description$Package +title <- "SRA" +prefix <- "sra-" +prefix_examples <- c("sra-loader", "sra-platform") +portals <- c("NCBI", "NCBI", "NCBI", "GISAID", "GISAID") +databases <- c("BIOSAMPLE", "SRA", "GENBANK", "FLU", "COV") +organism <- c("Influenza A Virus", "SARS-COV-2") +organism_abbrev <- c("FLU", "COV") + +# Define github repo +github_repo <- description$URL + +# Define github pages URL +github_pages_url <- description$GITHUB_PAGES + +# Create main config data frame +main_config_df <- data.frame( + portals = portals, + databases = databases +) %>% +dplyr::filter( + databases %in% toupper(!!title) +) + +# Read in data files +main_config_file <- yaml::read_yaml("../config/main_config.yaml") + +# Store all required fields +metadata_df <- reshape2::melt(main_config_file$SUBMISSION_PORTAL$COMMON_FIELDS) %>% + dplyr::transmute( + Column_name = gsub("[*&?#]", "", L1), + Description = value + ) + +# Combine all fields in given databases and portals +for(d in 1:nrow(main_config_df)){ + #d=1 + database <- main_config_df$databases[d] + portal <- main_config_df$portals[which(main_config_df$databases %in% database)] + + if("COMMON_FIELDS" %in% 
names(main_config_file$SUBMISSION_PORTAL$PORTAL_NAMES[[portal]])){ + portal_fields <- reshape2::melt(main_config_file$SUBMISSION_PORTAL$PORTAL_NAMES[[portal]]$COMMON_FIELDS) %>% + dplyr::transmute( + Column_name = gsub("[*&?#]", "", L1), + Description = value + ) + + metadata_df <- metadata_df %>% + dplyr::bind_rows(portal_fields) %>% + dplyr::distinct(.keep_all = TRUE) + + } + + database_fields <- reshape2::melt(main_config_file$SUBMISSION_PORTAL$PORTAL_NAMES[[portal]]$DATABASE[[database]]) %>% + dplyr::transmute( + Column_name = gsub("[*&?#]", "", L1), + Description = value + ) + + metadata_df <- metadata_df %>% + dplyr::bind_rows(database_fields) %>% + dplyr::distinct(.keep_all = TRUE) + +} +``` + +## Overview + +**Sequence Read Archive (SRA)** data, available through multiple cloud providers and NCBI servers, is the largest publicly available repository of high throughput sequencing data. The archive accepts data from all branches of life as well as metagenomic and environmental surveys. **SRA** stores raw sequencing data and alignment information to enhance reproducibility and facilitate new discoveries through data analysis. + +Before one can upload sequence read archives to **`r title`** database using ``r program``, they must ensure the requirement files (such as `config.yaml`, `metadata.csv`, `sequence.fasta`, `raw reads`, etc.) are prepared in advance and stored in a submission directory of choice. + +## Requirement files + +- [Config file](#config-file) in a `yaml` format +- [Sequence read archives](#sequence-read-archives) in a `bam/sff/hdf5/fastq` format +- [Metadata file](#metadata-file) in a `csv` format + + +## Config file + +Config file is a yaml file that provides a brief description about the submission and contains user credentials that allow ``r program`` to authenticate the database prior to upload a submission. + +![](images/config_file.png) + +:::{style="padding: 10px; border: 1px solid blue !important;"} + **NOTE:**
+ +- To submit to NCBI only, one can remove the **GISAID Submission (b)** section from the config file. Vice versa, to submit to GISAID only, just remove the **NCBI Submission (a)** section.
+- **Submission_Position** determines the order of databases in which we will submit to first. For instance, if GISAID is set as `Primary`, **_`r program`_** will submit to GISAID first, then after all samples are assigned with a GISAID accession number, **_`r program`_** will proceed to submit to NCBI. This order of submission ensures samples are linked correctly between the two databases.
+- **Username** and **Password** under the **NCBI Submission (a)** section are the credentials used to authenticate with the **NCBI FTP Server** (not to be confused with an individual NCBI account). See [PRE-REQUISITES](`r github_pages_url`/index.html#prerequisites) for more details. +::: + +## Sequence read archives + +Currently, NCBI accepts binary files such as BAM, SFF, and HDF5 formats and text formats such as FASTQ. See [SRA Submit Formats](https://www.ncbi.nlm.nih.gov/sra/docs/submitformats/) for more details. + +:::{style="padding: 10px; border: 1px solid blue !important;"} + **NOTE:**
+ +- Sequence read archive for all samples must be stored in a subfolder called `raw_reads` inside a submission directory of choice +::: + +
+ +## Metadata file + +Here is a short description about the fields in the metadata worksheet. + +```{r include=TRUE, echo=FALSE, message=FALSE, warning=FALSE} +knitr::kable(metadata_df, format = "html", row.names = FALSE, escape = FALSE) +``` + +
+ +**NOTE:** The prefix of **“`r prefix`”** is used to identify attributes for **`r title`** submissions + +To include additional attributes to **`r title`** submissions, just add ``r prefix`` in front of the desired attributes, e.g. ``r paste0( prefix_examples, collapse=", ")``, etc. See [SRA metadata section](https://www.ncbi.nlm.nih.gov/sra/docs/submitmeta/) for more details. + +

+ +### [*You are now ready to install ``r program`` and batch upload your submission*](`r github_pages_url`/articles/local_installation.html) + +


+ +Any questions or issues? Please report them on our Github issue tracker. + +
+ diff --git a/.Rproj.user/39CB7C5D/sources/session-644ed55e/11FC1D0D b/.Rproj.user/39CB7C5D/sources/session-644ed55e/11FC1D0D new file mode 100644 index 0000000..23b73b1 --- /dev/null +++ b/.Rproj.user/39CB7C5D/sources/session-644ed55e/11FC1D0D @@ -0,0 +1,26 @@ +{ + "id": "11FC1D0D", + "path": "~/Github/seqsender/vignettes/docker_installation.Rmd", + "project_path": "vignettes/docker_installation.Rmd", + "type": "r_markdown", + "hash": "1058644308", + "contents": "", + "dirty": false, + "created": 1707157092942.0, + "source_on_save": false, + "relative_order": 8, + "properties": { + "source_window_id": "", + "Source": "Source", + "cursorPosition": "130,7", + "scrollLine": "117" + }, + "folds": "", + "lastKnownWriteTime": 1707512280, + "encoding": "UTF-8", + "collab_server": "", + "source_window": "", + "last_content_update": 1707512280138, + "read_only": false, + "read_only_alternatives": [] +} \ No newline at end of file diff --git a/.Rproj.user/39CB7C5D/sources/session-644ed55e/11FC1D0D-contents b/.Rproj.user/39CB7C5D/sources/session-644ed55e/11FC1D0D-contents new file mode 100644 index 0000000..680c19d --- /dev/null +++ b/.Rproj.user/39CB7C5D/sources/session-644ed55e/11FC1D0D-contents @@ -0,0 +1,533 @@ +--- +output: rmarkdown::html_document +title: "How to run seqsender with Docker" +vignette: > + %\VignetteIndexEntry{How to run seqsender with Docker"} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include=FALSE, echo=FALSE, message=FALSE, warning=FALSE} +# R libraries +library(yaml) # for yaml file + +# Read in the DESCRIPTION file +description <- yaml::read_yaml("../DESCRIPTION") + +# Define variables +program <- description$Package + +# Define github repo +github_repo <- description$URL + +# Define github pages URL +github_pages_url <- description$GITHUB_PAGES +``` + + + +**SOFTWARE REQUIREMENTS:** + +- Linux (64-bit) or Mac OS X (64-bit) +- Git version 2.25.1 or later +- Docker version 20.10.14 or later +- Standard 
utilities: curl, tar, unzip + +**ADDITIONAL REQUIREMENTS:** + +See [PRE-REQUISITES](`r github_pages_url`/index.html#prerequisites) and [REQUIREMENT FILES](`r github_pages_url`/index.html#requirement-files) before proceeding to the next steps + +### (1) Clone ``r program`` repo to your $HOME directory + +``` bash +cd $HOME +git clone `r github_repo`.git +``` + +### (2) Navigate to ``r program`` folder where `Dockerfile` file is stored and build its docker image + +``` bash +cd `r program` +docker build -t `r program`:latest . +``` + +**-t**: add a tag to an image, e.g. *`r program`:1.0.0* or +*`r program`:latest* + +### (3) After the build is completed, you can check if the image is built successfully + +``` bash +docker images + +REPOSITORY TAG IMAGE ID CREATED SIZE +`r program` latest d9e2578d2211 2 weeks ago 581GB +``` + +### (4) Run ``r program`` container + +``` bash +docker run \ +-v $HOME:/data \ +-t -d `r program`:latest \ +--name `r program` +``` + +**`-t`**: allocate a pseudo-tty
+**`-d`**: run the container in detached mode
+**`-v`**: mount data files from host directory to container directory **[host_dir]:[container_dir]**. By exposing the host directory to the docker container, docker will be able to access data files within that mounted directory and use them to fire up the ``r program`` workflows. **NOTE:** Here we are mounting the local `$HOME` directory to the `/data` directory inside the container.
+**`--name`**: give an identity to the container
+ +For more information about the Docker syntax, see +Docker +run reference + +To check if the container is built successfully + +``` bash +docker container ps + + +CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES +b37b6b19c4e8 `r program`:latest "/bin/bash" 5 hours ago Up 5 hours `r program` +``` + +### (5) See a list of commands in ``r program`` container + +``` bash +docker exec -it `r program` bash `r program`-kickoff --help +``` + +**`-t`**: allocate a pseudo-tty
+**`-i`**: keep STDIN open even if not attached
+**`-h`**, **`--help`**: show help messages and exit + +``` bash +usage: `r program`.py [-h] + {prep,submit,check_submission_status,template,version} ... + +Automate the process of batch uploading consensus sequences and metadata to +databases of your choices + +positional arguments: + {prep,submit,check_submission_status,template,version} + +optional arguments: + -h, --help show this help message and exit +``` + +Rather than hastily jump in and submit a `production` submission right away, we can utilize GISAID's and NCBI's **“TEST-SERVER”** to upload a `test` submission first. That way submitter can familiarize themselves with the submission process prior to make a real submission. + +**Note:** Duplicate test submissions will result in an error. Please create new sequence names each time you plan to run test submissions to avoid this issue. + +### Submit a `test` submission with a pre-processed dataset + + + +### Submit a `test` submission with your own dataset + + + +


+ +Any questions or issues? Please report them on our Github issue tracker. + +
+ + + + + diff --git a/.Rproj.user/39CB7C5D/sources/session-644ed55e/23D64F45 b/.Rproj.user/39CB7C5D/sources/session-644ed55e/23D64F45 new file mode 100644 index 0000000..d15748c --- /dev/null +++ b/.Rproj.user/39CB7C5D/sources/session-644ed55e/23D64F45 @@ -0,0 +1,26 @@ +{ + "id": "23D64F45", + "path": "~/Github/seqsender/vignettes/biosample_submission.Rmd", + "project_path": "vignettes/biosample_submission.Rmd", + "type": "r_markdown", + "hash": "1471836763", + "contents": "", + "dirty": false, + "created": 1707160700849.0, + "source_on_save": false, + "relative_order": 3, + "properties": { + "source_window_id": "", + "Source": "Source", + "cursorPosition": "18,3", + "scrollLine": "2" + }, + "folds": "", + "lastKnownWriteTime": 1707516426, + "encoding": "UTF-8", + "collab_server": "", + "source_window": "", + "last_content_update": 1707516426689, + "read_only": false, + "read_only_alternatives": [] +} \ No newline at end of file diff --git a/.Rproj.user/39CB7C5D/sources/session-644ed55e/23D64F45-contents b/.Rproj.user/39CB7C5D/sources/session-644ed55e/23D64F45-contents new file mode 100644 index 0000000..3d5eac6 --- /dev/null +++ b/.Rproj.user/39CB7C5D/sources/session-644ed55e/23D64F45-contents @@ -0,0 +1,155 @@ +--- +title: "NCBI - BioSample" +output: rmarkdown::html_document +vignette: > + %\VignetteIndexEntry{NCBI - BioSample} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + + + +```{r, include=FALSE, echo=FALSE, message=FALSE, warning=FALSE} +# R libraries +library(knitr) # for html table +library(yaml) # for yaml file +library(tidyverse) # for pipe +library(reshape2) # for data manipulation + +# Read in the DESCRIPTION file +description <- yaml::read_yaml("../DESCRIPTION") + +# Define variables +program <- description$Package +title <- "BioSample" +prefix <- "bs-" +prefix_examples <- c("bs-host_age", "bs-host_sex") +portals <- c("NCBI", "NCBI", "NCBI", "GISAID", "GISAID") +databases <- c("BIOSAMPLE", "SRA", "GENBANK", "FLU", "COV") 
+organism <- c("Influenza A Virus", "SARS-COV-2") +organism_abbrev <- c("FLU", "COV") + +# Define github repo +github_repo <- description$URL + +# Define github pages URL +github_pages_url <- description$GITHUB_PAGES + +# Create main config data frame +main_config_df <- data.frame( + portals = portals, + databases = databases +) %>% +dplyr::filter( + databases %in% toupper(!!title) +) + +# Read in data files +main_config_file <- yaml::read_yaml("../config/main_config.yaml") + +# Store all required fields +metadata_df <- reshape2::melt(main_config_file$SUBMISSION_PORTAL$COMMON_FIELDS) %>% + dplyr::transmute( + Column_name = gsub("[*&?#]", "", L1), + Description = value + ) + +# Combine all fields in given databases and portals +for(d in 1:nrow(main_config_df)){ + #d=1 + database <- main_config_df$databases[d] + portal <- main_config_df$portals[which(main_config_df$databases %in% database)] + + if("COMMON_FIELDS" %in% names(main_config_file$SUBMISSION_PORTAL$PORTAL_NAMES[[portal]])){ + portal_fields <- reshape2::melt(main_config_file$SUBMISSION_PORTAL$PORTAL_NAMES[[portal]]$COMMON_FIELDS) %>% + dplyr::transmute( + Column_name = gsub("[*&?#]", "", L1), + Description = value + ) + + metadata_df <- metadata_df %>% + dplyr::bind_rows(portal_fields) %>% + dplyr::distinct(.keep_all = TRUE) + + } + + database_fields <- reshape2::melt(main_config_file$SUBMISSION_PORTAL$PORTAL_NAMES[[portal]]$DATABASE[[database]]) %>% + dplyr::transmute( + Column_name = gsub("[*&?#]", "", L1), + Description = value + ) + + metadata_df <- metadata_df %>% + dplyr::bind_rows(database_fields) %>% + dplyr::distinct(.keep_all = TRUE) + +} + +``` + +## Overview + +**`r title`** is a database containing aggregated information pertaining to reference samples and samples stored in the [European Bioinformatics Institute](https://www.ebi.ac.uk/) assay databases. 
+ +Before submitters can upload their experimental samples to **`r title`** database using ``r program``, they must ensure the requirement files (such as `config.yaml`, `metadata.csv`, `sequence.fasta`, `raw reads`, etc.) are already prepared ahead of time and stored them in a submission folder of choice (e.g., `submission_name`) within a parent submission directory (e.g., `submission_dir`). That way ``r program`` will be able to scoop up the necessary files in that folder, generate submission files, and then batch uploading them to the submitting database of choices. + +## Requirement files + +- [Config file](#config-file) in a `yaml` format +- [Metadata file](#metadata-file) in a `csv` format + +A quick look of where to store all of the requirement files + +![](images/submission_dir.png) + +### Config file + +Config file is a yaml file that provides a brief description about the submission and contains user credentials that allow ``r program`` to authenticate the database prior to upload a submission. + +![](images/config_file.png) + +:::{style="padding: 10px; border: 1px solid blue !important;"} + **NOTE:**
+ +- To submit to NCBI only, one can remove the **GISAID Submission (b)** section from the config file. Vice versa, to submit to GISAID only, just remove the **NCBI Submission (a)** section.
+- **Submission_Position** determines the order in which the databases are submitted to. For instance, if GISAID is set to `1`, **_`r program`_** will submit to GISAID first; then, after all samples have been assigned a GISAID accession number, **_`r program`_** will proceed to submit to NCBI. This order of submission ensures samples are linked correctly between the two databases.
+- **Username** and **Password** under the **NCBI Submission (a)** section are the credentials used to authenticate with the **NCBI FTP Server** (not to be confused with an individual NCBI account). See [PRE-REQUISITES](`r github_pages_url`/index.html#prerequisites) for more details. +::: + +### Metadata file + +The metadata worksheet is a comma-delimited (csv) file that contains required attributes that are useful for the rapid analysis and trace back of **`r paste0(organism, collapse=" or ")`** cases. + +Here is a short description of the fields in the metadata worksheet. + +```{r include=TRUE, echo=FALSE, message=FALSE, warning=FALSE} +knitr::kable(metadata_df, format = "html", row.names = FALSE, escape = FALSE) +``` + +
+ +**NOTE:** The prefix of **“`r prefix`”** is used to identify attributes for **`r title`** submissions. + +To include additional attributes in **`r title`** submissions, just prepend ``r prefix`` to the desired attributes, e.g. ``r paste0(prefix_examples, collapse=", ")``, etc. See the [Pathogen.cl.1.0](https://www.ncbi.nlm.nih.gov/biosample/docs/packages/Pathogen.cl.1.0/) package for more attributes. + +

+ +

[* You are now ready to install ``r program`` and batch upload your submission*](`r github_pages_url`/articles/local_installation.html)

+ +


+ +Any questions or issues? Please report them on our Github issue tracker. + +
+ diff --git a/.Rproj.user/39CB7C5D/sources/session-644ed55e/26EC7DA2-contents b/.Rproj.user/39CB7C5D/sources/session-644ed55e/26EC7DA2-contents new file mode 100644 index 0000000..22e66a1 --- /dev/null +++ b/.Rproj.user/39CB7C5D/sources/session-644ed55e/26EC7DA2-contents @@ -0,0 +1,12 @@ + +citHeader("To cite seqsender in publications use:") + +citEntry( + entry = "manual", + title = "seqsender: Public Database Submission Pipeline", + author = "Dakota Howard, Reina Chau, Peter Cook, Kristine Lacek, Amanda Sullivan, Vikram Setlur, Thomas Stark, Brian Lee, Benjamin Rambo-Martin", + institution = "Centers for Disease Control and Prevention", + address = "1600 Clifton Road NE, Building 21, 8th Floor, Atlanta, Georgia 30333", + year = "2023", + textVersion = "Howard, D. et al. seqsender: Public Database Submission Pipeline." +) \ No newline at end of file diff --git a/.Rproj.user/39CB7C5D/sources/session-644ed55e/44A17F56-contents b/.Rproj.user/39CB7C5D/sources/session-644ed55e/44A17F56-contents new file mode 100644 index 0000000..2369973 --- /dev/null +++ b/.Rproj.user/39CB7C5D/sources/session-644ed55e/44A17F56-contents @@ -0,0 +1,161 @@ +--- +title: "NCBI - GenBank" +output: rmarkdown::html_document +vignette: > + %\VignetteIndexEntry{NCBI - GenBank} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + + + +```{r, include=FALSE, echo=FALSE, message=FALSE, warning=FALSE} +# R libraries +library(knitr) # for html table +library(yaml) # for yaml file +library(tidyverse) # for pipe +library(reshape2) # for data manipulation + +# Read in the DESCRIPTION file +description <- yaml::read_yaml("../DESCRIPTION") + +# Define variables +program <- description$Package +title <- "GenBank" +prefix <- "gb-" +portals <- c("NCBI", "NCBI", "NCBI", "GISAID", "GISAID") +databases <- c("BIOSAMPLE", "SRA", "GENBANK", "FLU", "COV") +organism <- c("Influenza A Virus", "SARS-COV-2") +organism_abbrev <- c("FLU", "COV") + +# Define github repo +github_repo <- description$URL 
+ +# Define github pages URL +github_pages_url <- description$GITHUB_PAGES + +# Create main config data frame +main_config_df <- data.frame( + portals = portals, + databases = databases +) %>% +dplyr::filter( + databases %in% toupper(!!title) +) + +# Read in data files +main_config_file <- yaml::read_yaml("../config/main_config.yaml") + +# Store all required fields +metadata_df <- reshape2::melt(main_config_file$SUBMISSION_PORTAL$COMMON_FIELDS) %>% + dplyr::transmute( + Column_name = gsub("[*&?#]", "", L1), + Description = value + ) + +# Combine all fields in given databases and portals +for(d in 1:nrow(main_config_df)){ + #d=1 + database <- main_config_df$databases[d] + portal <- main_config_df$portals[which(main_config_df$databases %in% database)] + + if("COMMON_FIELDS" %in% names(main_config_file$SUBMISSION_PORTAL$PORTAL_NAMES[[portal]])){ + portal_fields <- reshape2::melt(main_config_file$SUBMISSION_PORTAL$PORTAL_NAMES[[portal]]$COMMON_FIELDS) %>% + dplyr::transmute( + Column_name = gsub("[*&?#]", "", L1), + Description = value + ) + + metadata_df <- metadata_df %>% + dplyr::bind_rows(portal_fields) %>% + dplyr::distinct(.keep_all = TRUE) + + } + + database_fields <- reshape2::melt(main_config_file$SUBMISSION_PORTAL$PORTAL_NAMES[[portal]]$DATABASE[[database]]) %>% + dplyr::transmute( + Column_name = gsub("[*&?#]", "", L1), + Description = value + ) + + metadata_df <- metadata_df %>% + dplyr::bind_rows(database_fields) %>% + dplyr::distinct(.keep_all = TRUE) + +} +``` + +## Overview + +The **GenBank** sequence database is an open access, annotated collection of all publicly available nucleotide sequences and their protein translations. It is produced and maintained by the **National Center for Biotechnology Information** (NCBI; a part of the **National Institutes of Health** in the United States) as part of the **International Nucleotide Sequence Database Collaboration (INSDC)**. 
+ +Before submitters can batch upload meta- and sequence-data to the **`r title`** database using ``r program``, they must ensure the requirement files (such as `config.yaml`, `metadata.csv`, `sequence.fasta`, `raw reads`, etc.) are prepared ahead of time and stored in a submission folder of choice (e.g., `submission_name`) within a parent submission directory (e.g., `submission_dir`). That way ``r program`` will be able to pick up the necessary files in that folder, generate submission files, and then batch upload them to the databases of choice. + +## Requirement files + +- [Config file](#config-file) in a `yaml` format +- [Fasta file](#fasta-file) in a `fasta` format +- [Metadata file](#metadata-file) in a `csv` format + +A quick look at where to store all of the requirement files: + +![](images/submission_dir.png) + +### Config file + +The config file is a yaml file that provides a brief description of the submission and contains the user credentials that allow ``r program`` to authenticate with the database prior to uploading a submission. + +![](images/config_file.png) + +:::{style="padding: 10px; border: 1px solid blue !important;"} + + **NOTE:**
+ +- To submit to NCBI only, one can remove the **GISAID Submission (b)** section from the config file. Vice versa, to submit to GISAID only, just remove the **NCBI Submission (a)** section.
+- **Submission_Position** determines the order in which the databases are submitted to. For instance, if GISAID is set to `1`, **_`r program`_** will submit to GISAID first; then, after all samples have been assigned a GISAID accession number, **_`r program`_** will proceed to submit to NCBI. This order of submission ensures samples are linked correctly between the two databases.
+- **Username** and **Password** under the **NCBI Submission (a)** section are the credentials used to authenticate with the **NCBI FTP Server** (not to be confused with an individual NCBI account). See [PRE-REQUISITES](`r github_pages_url`/index.html#prerequisites) for more details. +::: + +### Fasta file + +The fasta file contains nucleotide sequences for all samples. See [Genbank Fasta Format](https://www.ncbi.nlm.nih.gov/genbank/fastaformat/) for more details. + +### Metadata file + +The metadata worksheet is a comma-delimited (csv) file that contains required attributes that are useful for the rapid analysis and trace back of **`r paste0(organism, collapse=" or ")`** cases. + +Here is a short description of the fields in the metadata worksheet. + +```{r include=TRUE, echo=FALSE, message=FALSE, warning=FALSE} +knitr::kable(metadata_df, format = "html", row.names = FALSE, escape = FALSE) +``` + +
+ +**NOTE:** The prefix of **“`r prefix`”** is used to identify attributes for **`r title`** submissions. The prefix of **“src-”** is used to identify attributes for the **Source Information Table**. Likewise, the prefix of **“cmt-”** is used to identify attributes for the **Structured Comment Table**. + +To include additional attributes in the **Source Information Table**, just prepend `src-` to the desired attributes, e.g. `src-subtype`, `src-passage`, etc. See [Genbank Source Table Modifier](https://www.ncbi.nlm.nih.gov/WebSub/html/help/genbank-source-table.html#modifiers) for more details. + +To include additional attributes in the **Structured Comment Table**, just prepend `cmt-` to the desired attributes, and most importantly, the fields must be sandwiched between `cmt-StructuredCommentPrefix` and `cmt-StructuredCommentSuffix`. For example, `cmt-StructuredCommentPrefix`, `cmt-Assembly Method`, `cmt-Coverage`, `...`, `cmt-Sequencing Technology`, `cmt-StructuredCommentSuffix`. See [Genbank Structured Comment](https://www.ncbi.nlm.nih.gov/genbank/structuredcomment/#GenBank) for more details. + +

+ +

[* You are now ready to install ``r program`` and batch upload your submission*](`r github_pages_url`/articles/local_installation.html)

+ +


+ +Any questions or issues? Please report them on our Github issue tracker. + +
+ diff --git a/.Rproj.user/39CB7C5D/sources/session-644ed55e/492378CA-contents b/.Rproj.user/39CB7C5D/sources/session-644ed55e/492378CA-contents new file mode 100644 index 0000000..f8a33cd --- /dev/null +++ b/.Rproj.user/39CB7C5D/sources/session-644ed55e/492378CA-contents @@ -0,0 +1,194 @@ +--- +output: rmarkdown::html_document +title: "GISAID - EpiCoV" +vignette: > + %\VignetteIndexEntry{GISAID - EpiCoV} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + + + +```{r, include=FALSE, echo=FALSE, message=FALSE, warning=FALSE} +# R libraries +library(knitr) # for html table +library(yaml) # for yaml file +library(tidyverse) # for pipe +library(reshape2) # for data manipulation + +# Read in the DESCRIPTION file +description <- yaml::read_yaml("../DESCRIPTION") + +# Define variables +program <- description$Package +title <- "EpiCoV" +prefix <- "gs-" +cli <- "covCLI" +cli_list <- c("EpiFlu", "EpiCoV", "EpiRSV", "EpiArbo") +portals <- c("NCBI", "NCBI", "NCBI", "GISAID", "GISAID") +databases <- c("BIOSAMPLE", "SRA", "GENBANK", "FLU", "COV") +organism <- c("SARS-COV-2") +organism_abbrev <- c("COV") + +# Define github repo +github_repo <- description$URL + +# Define github pages URL +github_pages_url <- description$GITHUB_PAGES + +# Create main config data frame +main_config_df <- data.frame( + portals = portals, + databases = databases +) %>% +dplyr::filter( + databases %in% toupper(!!organism_abbrev) +) + +# Read in data files +main_config_file <- yaml::read_yaml("../config/main_config.yaml") + +# Store all required fields +metadata_df <- reshape2::melt(main_config_file$SUBMISSION_PORTAL$COMMON_FIELDS) %>% + dplyr::transmute( + Column_name = gsub("[*&?#]", "", L1), + Description = value + ) + +# Combine all fields in given databases and portals +for(d in 1:nrow(main_config_df)){ + #d=1 + database <- main_config_df$databases[d] + portal <- main_config_df$portals[which(main_config_df$databases %in% database)] + + if("COMMON_FIELDS" %in% 
names(main_config_file$SUBMISSION_PORTAL$PORTAL_NAMES[[portal]])){ + portal_fields <- reshape2::melt(main_config_file$SUBMISSION_PORTAL$PORTAL_NAMES[[portal]]$COMMON_FIELDS) %>% + dplyr::transmute( + Column_name = gsub("[*&?#]", "", L1), + Description = value + ) + + metadata_df <- metadata_df %>% + dplyr::bind_rows(portal_fields) %>% + dplyr::distinct(.keep_all = TRUE) + + } + + database_fields <- reshape2::melt(main_config_file$SUBMISSION_PORTAL$PORTAL_NAMES[[portal]]$DATABASE[[database]]) %>% + dplyr::transmute( + Column_name = gsub("[*&?#]", "", L1), + Description = value + ) + + metadata_df <- metadata_df %>% + dplyr::bind_rows(database_fields) %>% + dplyr::distinct(.keep_all = TRUE) + +} + +optional_attributes_df <- read.csv("./data/cov_metadata_optional_fields.csv", header=TRUE) + +``` + +## Overview + +**GISAID**, short for the **Global Initiative on Sharing All Influenza Data**, is an organization that manages a restricted-access database containing genomic sequence data of select virus, primarily influenza viruses. The database has expanded to include the coronavirus responsible for the COVID-19 pandemic as well as other pathogens. + +## Prerequisites + +For all GISAID submissions, ``r program`` makes use of GISAID's Command Line Interface Tools (CLIs) to batch uploading meta- and sequence-data to their databases. Prior to perform a batch upload to **`r title` database**, submitters must + +1. Download the **`r paste(title, "CLI")`** package from the **GISAID Platform** that is compatible with their machine (e.g., Linux, macOS, or Windows). + +![](images/`r cli`_download.png) +![](images/`r cli`_download_2.png) + + +
+ +2. Unzip the downloaded package and store it in a subfolder called **`gisaid_cli`** within a submission directory of choice (e.g., `submission_dir`). + +![](images/gisaid_cli_dir.png) + +
+ +## Requirement files + +After submitters have obtained the **GISAID CLI** for **`r title`**, they must also prepare the requirement files (such as `config.yaml`, `metadata.csv`, `sequence.fasta`, `raw reads`, etc.) and store them in a submission folder of choice (e.g., `submission_name`) within a parent submission directory (e.g., `submission_dir`). That way ``r program`` will be able to pick up the necessary files in that folder, generate submission files, and then batch upload them to the databases of choice. + +Here is a list of the requirement files and where to store them: + +- [Config file](#config-file) in a `yaml` format +- [Fasta file](#fasta-file) in a `fasta` format +- [Metadata file](#metadata-file) in a `csv` format + + +![](images/submission_dir.png) + +### Config file + +The config file is a yaml file that provides a brief description of the submission and contains the user credentials that allow ``r program`` to authenticate with the database prior to uploading a submission. + +![](images/config_file.png) + +:::{style="padding: 10px; border: 1px solid blue !important;"} + **NOTE:**
+ +- To submit to NCBI only, one can remove the **GISAID Submission (b)** section from the config file. Vice versa, to submit to GISAID only, just remove the **NCBI Submission (a)** section.
+- **Submission_Position** determines the order in which the databases are submitted to. For instance, if GISAID is set to `1`, **_`r program`_** will submit to GISAID first; then, after all samples have been assigned a GISAID accession number, **_`r program`_** will proceed to submit to NCBI. This order of submission ensures samples are linked correctly between the two databases.
+- **Username** and **Password** under the **NCBI Submission (a)** section are the credentials used to authenticate with the **NCBI FTP Server** (not to be confused with an individual NCBI account). See [PRE-REQUISITES](`r github_pages_url`/index.html#prerequisites) for more details. +::: + +### Fasta file + +The fasta file contains nucleotide sequences for all samples. See [Genbank Fasta Format](https://www.ncbi.nlm.nih.gov/genbank/fastaformat/) for more details. + +![](images/`r cli`_fasta.png) + +### Metadata file + +The metadata worksheet is a comma-delimited (csv) file that contains required attributes that are useful for the rapid analysis and trace back of **`r paste0(organism, collapse=" or ")`** cases. + +Here is a short description of the fields in the metadata worksheet. + +```{r include=TRUE, echo=FALSE, message=FALSE, warning=FALSE} +knitr::kable(metadata_df, format = "html", row.names = FALSE, escape = FALSE) +``` + +
+ +**NOTE:** The prefix of **“`r prefix`”** is used to identify attributes for **GISAID** submissions. + +
+ +#### Optional Attributes + +To include additional attributes to **`r title`** submissions, just append ``r prefix`` in front of the desired attributes. Here is a list of optional attributes: + +```{r include=TRUE, echo=FALSE, message=FALSE, warning=FALSE} +knitr::kable(optional_attributes_df, format = "html", row.names = FALSE, escape = FALSE) +``` + +
+ +

+ +

[* You are now ready to install ``r program`` and batch upload your submission*](`r github_pages_url`/articles/local_installation.html)

+ +


+ +Any questions or issues? Please report them on our Github issue tracker. + +
+ diff --git a/.Rproj.user/39CB7C5D/sources/session-644ed55e/49C60C80 b/.Rproj.user/39CB7C5D/sources/session-644ed55e/49C60C80 new file mode 100644 index 0000000..7e038a1 --- /dev/null +++ b/.Rproj.user/39CB7C5D/sources/session-644ed55e/49C60C80 @@ -0,0 +1,26 @@ +{ + "id": "49C60C80", + "path": "~/Github/seqsender/vignettes/gisaid_flu_submission.Rmd", + "project_path": "vignettes/gisaid_flu_submission.Rmd", + "type": "r_markdown", + "hash": "0", + "contents": "", + "dirty": false, + "created": 1707160572458.0, + "source_on_save": false, + "relative_order": 2, + "properties": { + "source_window_id": "", + "Source": "Source", + "cursorPosition": "11,0", + "scrollLine": "4" + }, + "folds": "", + "lastKnownWriteTime": 1707516365, + "encoding": "UTF-8", + "collab_server": "", + "source_window": "", + "last_content_update": 1707516365012, + "read_only": false, + "read_only_alternatives": [] +} \ No newline at end of file diff --git a/.Rproj.user/39CB7C5D/sources/session-644ed55e/49C60C80-contents b/.Rproj.user/39CB7C5D/sources/session-644ed55e/49C60C80-contents new file mode 100644 index 0000000..4e7cd51 --- /dev/null +++ b/.Rproj.user/39CB7C5D/sources/session-644ed55e/49C60C80-contents @@ -0,0 +1,189 @@ +--- +output: rmarkdown::html_document +title: "GISAID - EpiFlu" +vignette: > + %\VignetteIndexEntry{GISAID - EpiFlu} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + + + +```{r, include=FALSE, echo=FALSE, message=FALSE, warning=FALSE} +# R libraries +library(knitr) # for html table +library(yaml) # for yaml file +library(tidyverse) # for pipe +library(reshape2) # for data manipulation + +# Read in the DESCRIPTION file +description <- yaml::read_yaml("../DESCRIPTION") + +# Define variables +program <- description$Package +title <- "EpiFlu" +prefix <- "gs-" +cli <- "fluCLI" +cli_list <- c("EpiFlu", "EpiCoV", "EpiRSV", "EpiArbo") +portals <- c("NCBI", "NCBI", "NCBI", "GISAID", "GISAID") +databases <- c("BIOSAMPLE", "SRA", "GENBANK", "FLU", "COV") 
+organism <- c("Influenza A Virus") +organism_abbrev <- c("FLU") + +# Define github repo +github_repo <- description$URL + +# Define github pages URL +github_pages_url <- description$GITHUB_PAGES + +# Create main config data frame +main_config_df <- data.frame( + portals = portals, + databases = databases +) %>% +dplyr::filter( + databases %in% toupper(!!organism_abbrev) +) + +# Read in data files +main_config_file <- yaml::read_yaml("../config/main_config.yaml") + +# Store all required fields +metadata_df <- reshape2::melt(main_config_file$SUBMISSION_PORTAL$COMMON_FIELDS) %>% + dplyr::transmute( + Column_name = gsub("[*&?#]", "", L1), + Description = value + ) + +# Combine all fields in given databases and portals +for(d in 1:nrow(main_config_df)){ + #d=1 + database <- main_config_df$databases[d] + portal <- main_config_df$portals[which(main_config_df$databases %in% database)] + + if("COMMON_FIELDS" %in% names(main_config_file$SUBMISSION_PORTAL$PORTAL_NAMES[[portal]])){ + portal_fields <- reshape2::melt(main_config_file$SUBMISSION_PORTAL$PORTAL_NAMES[[portal]]$COMMON_FIELDS) %>% + dplyr::transmute( + Column_name = gsub("[*&?#]", "", L1), + Description = value + ) + + metadata_df <- metadata_df %>% + dplyr::bind_rows(portal_fields) %>% + dplyr::distinct(.keep_all = TRUE) + + } + + database_fields <- reshape2::melt(main_config_file$SUBMISSION_PORTAL$PORTAL_NAMES[[portal]]$DATABASE[[database]]) %>% + dplyr::transmute( + Column_name = gsub("[*&?#]", "", L1), + Description = value + ) + + metadata_df <- metadata_df %>% + dplyr::bind_rows(database_fields) %>% + dplyr::distinct(.keep_all = TRUE) + +} + +optional_attributes_df <- read.csv("./data/flu_metadata_optional_fields.csv", header=TRUE) + +``` + +## Overview + +**GISAID**, short for the **Global Initiative on Sharing All Influenza Data**, is an organization that manages a restricted-access database containing genomic sequence data of select virus, primarily influenza viruses. 
The database has expanded to include the coronavirus responsible for the COVID-19 pandemic as well as other pathogens. + +## Prerequisites + +For all GISAID submissions, ``r program`` makes use of GISAID's Command Line Interface Tools (CLIs) to batch upload meta- and sequence-data to their databases. Prior to performing a batch upload to the **`r title` database**, submitters must: + +1. Download the **`r paste(title, "CLI")`** package from the **GISAID Platform** that is compatible with their machine (e.g., Linux, macOS, or Windows). + + +![](images/`r cli`_download.png) + +
+ +2. Unzip the downloaded package and store it in a subfolder called **`gisaid_cli`** within a submission directory of choice (e.g., `submission_dir`). + +![](images/gisaid_cli_dir.png) + +
+ +## Requirement files + +After submitters have obtained the **GISAID CLI** for **`r title`**, they must also prepare the requirement files (such as `config.yaml`, `metadata.csv`, `sequence.fasta`, `raw reads`, etc.) and store them in a submission folder of choice (e.g., `submission_name`) within a parent submission directory (e.g., `submission_dir`). That way ``r program`` will be able to pick up the necessary files in that folder, generate submission files, and then batch upload them to the databases of choice. + +Here is a list of the requirement files and where to store them: + +- [Config file](#config-file) in a `yaml` format +- [Fasta file](#fasta-file) in a `fasta` format +- [Metadata file](#metadata-file) in a `csv` format + +![](images/submission_dir.png) + +### Config file + +The config file is a yaml file that provides a brief description of the submission and contains the user credentials that allow ``r program`` to authenticate with the database prior to uploading a submission. + +![](images/config_file.png) + +:::{style="padding: 10px; border: 1px solid blue !important;"} + **NOTE:**
+- To submit to NCBI only, one can remove the **GISAID Submission (b)** section from the config file. Vice versa, to submit to GISAID only, just remove the **NCBI Submission (a)** section.
+- **Submission_Position** determines the order in which the databases are submitted to. For instance, if GISAID is set to `1`, **_`r program`_** will submit to GISAID first; then, after all samples have been assigned a GISAID accession number, **_`r program`_** will proceed to submit to NCBI. This order of submission ensures samples are linked correctly between the two databases.
+- **Username** and **Password** under the **NCBI Submission (a)** section are the credentials used to authenticate with the **NCBI FTP Server** (not to be confused with an individual NCBI account). See [PRE-REQUISITES](`r github_pages_url`/index.html#prerequisites) for more details. +::: + +### Fasta file + +The fasta file contains nucleotide sequences for all samples. See [Genbank Fasta Format](https://www.ncbi.nlm.nih.gov/genbank/fastaformat/) for more details. + +![](images/`r cli`_fasta.png) + +### Metadata file + +The metadata worksheet is a comma-delimited (csv) file that contains required attributes that are useful for the rapid analysis and trace back of **`r paste0(organism, collapse=" or ")`** cases. + +Here is a short description of the fields in the metadata worksheet. + +```{r include=TRUE, echo=FALSE, message=FALSE, warning=FALSE} +knitr::kable(metadata_df, format = "html", row.names = FALSE, escape = FALSE) +``` + +
+ +**NOTE:** The prefix of **“`r prefix`”** is used to identify attributes for **GISAID** submissions. + +
+ +#### Optional Attributes + +To include additional attributes to **`r title`** submissions, just append ``r prefix`` in front of the desired attributes. Here is a list of optional attributes: + +```{r include=TRUE, echo=FALSE, message=FALSE, warning=FALSE} +knitr::kable(optional_attributes_df, format = "html", row.names = FALSE, escape = FALSE) +``` + +

+ +

[* You are now ready to install ``r program`` and batch upload your submission*](`r github_pages_url`/articles/local_installation.html)

+ +


+ +Any questions or issues? Please report them on our Github issue tracker. + +
+ diff --git a/.Rproj.user/39CB7C5D/sources/session-644ed55e/7944D2DE b/.Rproj.user/39CB7C5D/sources/session-644ed55e/7944D2DE new file mode 100644 index 0000000..0ca3ee3 --- /dev/null +++ b/.Rproj.user/39CB7C5D/sources/session-644ed55e/7944D2DE @@ -0,0 +1,26 @@ +{ + "id": "7944D2DE", + "path": "~/Github/seqsender/_pkgdown.yml", + "project_path": "_pkgdown.yml", + "type": "yaml", + "hash": "1818887783", + "contents": "", + "dirty": false, + "created": 1707156312052.0, + "source_on_save": false, + "relative_order": 1, + "properties": { + "source_window_id": "", + "Source": "Source", + "cursorPosition": "12,0", + "scrollLine": "0" + }, + "folds": "", + "lastKnownWriteTime": 1707498977, + "encoding": "UTF-8", + "collab_server": "", + "source_window": "", + "last_content_update": 1707498977652, + "read_only": false, + "read_only_alternatives": [] +} \ No newline at end of file diff --git a/.Rproj.user/39CB7C5D/sources/session-644ed55e/7944D2DE-contents b/.Rproj.user/39CB7C5D/sources/session-644ed55e/7944D2DE-contents new file mode 100644 index 0000000..759910d --- /dev/null +++ b/.Rproj.user/39CB7C5D/sources/session-644ed55e/7944D2DE-contents @@ -0,0 +1,52 @@ +url: https://github.com/CDCgov/seqsender + +template: + params: + bootswatch: cosmo + +news: + one_page: false + +navbar: + structure: + right: [news, github] + components: + left: + - icon: fa-home + href: index.html + - text: Getting Started + menu: + - text: Prerequisites + href: articles/prerequisites.html + - text: Installation + - text: How to run seqsender locally + href: articles/local_installation.html + - text: How to run seqsender with Docker + href: articles/docker_installation.html + - text: How to run seqsender with Compose + href: articles/compose_installation.html + - text: How to run seqsender with Singularity + href: articles/singularity_installation.html + - text: Databases + menu: + - text: NCBI + - text: BioSample + href: articles/biosample_submission.html + - text: SRA + href: 
articles/sra_submission.html + - text: GenBank + href: articles/genbank_submission.html + - text: GISAID + - text: EpiFlu + href: articles/gisaid_flu_submission.html + - text: EpiCoV + href: articles/gisaid_cov_submission.html + - text: Support + menu: + - text: FAQs + href: articles/faqs.html + +footer: + structure: + left: developed_by + right: built_with \ No newline at end of file diff --git a/.Rproj.user/39CB7C5D/sources/session-644ed55e/8DC9B3D9 b/.Rproj.user/39CB7C5D/sources/session-644ed55e/8DC9B3D9 new file mode 100644 index 0000000..41e9d95 --- /dev/null +++ b/.Rproj.user/39CB7C5D/sources/session-644ed55e/8DC9B3D9 @@ -0,0 +1,26 @@ +{ + "id": "8DC9B3D9", + "path": "~/Github/seqsender/inst/CITATION", + "project_path": "inst/CITATION", + "type": "text", + "hash": "0", + "contents": "", + "dirty": false, + "created": 1707162478476.0, + "source_on_save": false, + "relative_order": 14, + "properties": { + "source_window_id": "", + "Source": "Source", + "cursorPosition": "11,1", + "scrollLine": "0" + }, + "folds": "", + "lastKnownWriteTime": 1707499229, + "encoding": "UTF-8", + "collab_server": "", + "source_window": "", + "last_content_update": 1707499229446, + "read_only": false, + "read_only_alternatives": [] +} \ No newline at end of file diff --git a/.Rproj.user/39CB7C5D/sources/session-644ed55e/8DC9B3D9-contents b/.Rproj.user/39CB7C5D/sources/session-644ed55e/8DC9B3D9-contents new file mode 100644 index 0000000..fb434fe --- /dev/null +++ b/.Rproj.user/39CB7C5D/sources/session-644ed55e/8DC9B3D9-contents @@ -0,0 +1,12 @@ + +citHeader("To cite seqsender in publications use:") + +citEntry( + entry = "manual", + title = "seqsender: Public Database Submission Pipeline", + author = "Dakota Howard, Reina Chau, Peter Cook, Kristine Lacek, Amanda Sullivan, Vikram Setlur, Thomas Stark, Brian Lee, Benjamin Rambo-Martin", + institution = "Centers for Disease Control and Prevention", + address = "1600 Clifton Road NE, Building 21, 8th Floor, Atlanta, Georgia 30333", + 
year = format(Sys.Date(), "%Y"), + textVersion = "Howard, D. et al. seqsender: Public Database Submission Pipeline." +) \ No newline at end of file diff --git a/.Rproj.user/39CB7C5D/sources/session-644ed55e/9233AB25-contents b/.Rproj.user/39CB7C5D/sources/session-644ed55e/9233AB25-contents new file mode 100644 index 0000000..8dada3e --- /dev/null +++ b/.Rproj.user/39CB7C5D/sources/session-644ed55e/9233AB25-contents @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright {yyyy} {name of copyright owner} + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/.Rproj.user/39CB7C5D/sources/session-644ed55e/B0A067F1 b/.Rproj.user/39CB7C5D/sources/session-644ed55e/B0A067F1 new file mode 100644 index 0000000..b873f02 --- /dev/null +++ b/.Rproj.user/39CB7C5D/sources/session-644ed55e/B0A067F1 @@ -0,0 +1,26 @@ +{ + "id": "B0A067F1", + "path": "~/Github/seqsender/DESCRIPTION", + "project_path": "DESCRIPTION", + "type": "dcf", + "hash": "2980913655", + "contents": "", + "dirty": false, + "created": 1707156178289.0, + "source_on_save": false, + "relative_order": 3, + "properties": { + "source_window_id": "", + "Source": "Source", + "cursorPosition": "41,0", + "scrollLine": "11" + }, + "folds": "", + "lastKnownWriteTime": 1707497400, + "encoding": "UTF-8", + "collab_server": "", + "source_window": "", + "last_content_update": 1707497400958, + "read_only": false, + "read_only_alternatives": [] +} \ No newline at end of file diff --git a/.Rproj.user/39CB7C5D/sources/session-644ed55e/B0A067F1-contents b/.Rproj.user/39CB7C5D/sources/session-644ed55e/B0A067F1-contents new file mode 100644 index 0000000..1a7ddf7 --- /dev/null +++ b/.Rproj.user/39CB7C5D/sources/session-644ed55e/B0A067F1-contents @@ -0,0 +1,43 @@ +Package: seqsender +Type: Package +Title: Public Database Submission Pipeline +Version: 1.1.0 +Authors@R: + c( + person(given="Dakota", family="Howard", role=c("aut","cre"), + email="psv4@cdc.gov", comment=c(ORCID="0000-0002-7674-0385")), 
+ person(given="Reina", family="Chau", role=c("aut"), + email="snu3@cdc.gov", comment=c(ORCID="0000-0003-3012-1404")), + person(given="Peter", family="Cook", role=c("aut"), + email="ooj4@cdc.gov"), + person(given="Kristine", family="Lacek", role=c("aut"), + email="qgx6@cdc.gov", comment=c(ORCID="0000-0002-6247-5082")), + person(given="Amanda", family="Sullivan", role=c("aut"), + email="xpa3@cdc.gov"), + person(given="Vikram", family="Setlur", role=c("aut"), + email="xoe7@cdc.gov"), + person(given="Thomas", family="Stark", role=c("aut"), + email="ynh4@cdc.gov"), + person(given="Brian", family="Lee", role=c("aut"), + email="fya1@cdc.gov"), + person(given="Benjamin", family="Rambo-Martin", role=c("aut"), + email="nbx0@cdc.gov", comment=c(ORCID="0000-0002-8591-3954")) + ) +Description: seqsender is a Python program that is designed to automate the process of generating + necessary submission files (e.g. submission.xml, submission.zip, etc.) + and then bulk uploading them via FTP to NCBI archives such as Genbank, BioSample, and SRA. + Additionally, the program can batch uploading submissions of meta- and sequence-data to GISAID + using their Command Line Interface Tools (e.g., EpiFlu and EpiCoV CLI). + Currently, the pipeline is capable of uploading Influenza A Virus and SARS-COV-2 data. 
+RoxygenNote: 7.2.3 +License: Apache License (== 2.0) + file LICENSE +URL: https://github.com/CDCgov/seqsender +GITHUB_PAGES: https://cdcgov.github.io/seqsender +Docker: cdcgov/seqsender-dev:latest +Encoding: UTF-8 +VignetteBuilder: knitr +BugReports: https://github.com/CDCgov/seqsender/issues + + + + diff --git a/.Rproj.user/39CB7C5D/sources/session-644ed55e/B451F5CC b/.Rproj.user/39CB7C5D/sources/session-644ed55e/B451F5CC new file mode 100644 index 0000000..ebabc70 --- /dev/null +++ b/.Rproj.user/39CB7C5D/sources/session-644ed55e/B451F5CC @@ -0,0 +1,26 @@ +{ + "id": "B451F5CC", + "path": "~/Github/seqsender/vignettes/local_installation.Rmd", + "project_path": "vignettes/local_installation.Rmd", + "type": "r_markdown", + "hash": "3358203343", + "contents": "", + "dirty": false, + "created": 1707156979196.0, + "source_on_save": false, + "relative_order": 6, + "properties": { + "source_window_id": "", + "Source": "Source", + "cursorPosition": "159,7", + "scrollLine": "148" + }, + "folds": "", + "lastKnownWriteTime": 1707512277, + "encoding": "UTF-8", + "collab_server": "", + "source_window": "", + "last_content_update": 1707512277383, + "read_only": false, + "read_only_alternatives": [] +} \ No newline at end of file diff --git a/.Rproj.user/39CB7C5D/sources/session-644ed55e/B451F5CC-contents b/.Rproj.user/39CB7C5D/sources/session-644ed55e/B451F5CC-contents new file mode 100644 index 0000000..e7f9e29 --- /dev/null +++ b/.Rproj.user/39CB7C5D/sources/session-644ed55e/B451F5CC-contents @@ -0,0 +1,544 @@ +--- +output: rmarkdown::html_document +title: "How to run seqsender locally" +vignette: > + %\VignetteIndexEntry{How to run seqsender locally} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r include=FALSE, echo=FALSE, message=FALSE, warning=FALSE} +# R libraries +library(yaml) # for yaml file + +# Read in the DESCRIPTION file +description <- yaml::read_yaml("../DESCRIPTION") + +# Define variables +program <- description$Package + +# 
Define github repo +github_repo <- description$URL + +# Define github pages URL +github_pages_url <- description$GITHUB_PAGES +``` + + + +**SOFTWARE REQUIREMENTS:** + +- Linux (64-bit) or Mac OS X (64-bit) +- Git version 2.25.1 or later +- Standard utilities: curl, tar, unzip + +**ADDITIONAL REQUIREMENTS:** + +See [PRE-REQUISITES](`r github_pages_url`/index.html#prerequisites) and [REQUIREMENT FILES](`r github_pages_url`/index.html#requirement-files) before proceeding to the next steps + +## Micromamba Installation + +Here we recommend using **micromamba** to set up a virtual environment to run ``r program``. **Micromamba** is a tiny, statically linked C++ reimplementation of mamba which is an alternative to conda. The tool works as a standalone package manager that supports a subset of all mamba or conda commands, but it also has its own separate command line interfaces. For more information, visit [micromamba documentation](https://mamba.readthedocs.io/en/latest/user_guide/micromamba.html). + +To manually install, download and unzip the executable from the official **conda-forge** package to your `$HOME` directory using `tar`. 
+ +```bash +cd $HOME +``` + +- LINUX + +```bash +# Linux Intel (x86_64): +curl -Ls https://micro.mamba.pm/api/micromamba/linux-64/latest | tar -xvj bin/micromamba +# Linux ARM64: +curl -Ls https://micro.mamba.pm/api/micromamba/linux-aarch64/latest | tar -xvj bin/micromamba +# Linux Power: +curl -Ls https://micro.mamba.pm/api/micromamba/linux-ppc64le/latest | tar -xvj bin/micromamba +``` + +- macOS + +```bash +# macOS Intel (x86_64): +curl -Ls https://micro.mamba.pm/api/micromamba/osx-64/latest | tar -xvj bin/micromamba +# macOS Silicon/M1 (ARM64): +curl -Ls https://micro.mamba.pm/api/micromamba/osx-arm64/latest | tar -xvj bin/micromamba +``` + +After the extraction is completed, you can find the executable at `$HOME/bin/micromamba` + +- To quickly use `micromamba`, you can simply run + +```bash +export MAMBA_ROOT_PREFIX="$HOME/micromamba" +eval "$($HOME/bin/micromamba shell hook -s posix)" +``` + +- To persist using `micromamba`, you can append the following script to your `.bashrc` (or `.zshrc`) + +```bash +# >>> mamba initialize >>> +export MAMBA_EXE="$HOME/bin/micromamba"; +export MAMBA_ROOT_PREFIX="$HOME/micromamba"; +__mamba_setup="$("$MAMBA_EXE" shell hook --shell bash --root-prefix "$MAMBA_ROOT_PREFIX" 2> /dev/null)" +if [ $? -eq 0 ]; then + eval "$__mamba_setup" +else + alias micromamba="$MAMBA_EXE" # Fallback on help from mamba activate +fi +unset __mamba_setup +# <<< mamba initialize <<< +``` + +- To check the current version of `micromamba` + +```bash +micromamba --version +1.5.6 +``` + +## Set up a `micromamba` environment + +1. Clone this repository to your `$HOME` directory + +```bash +cd $HOME +git clone `r github_repo`.git +``` + +2. `CD` to **seqsender** folder where the `env.yaml` file is stored. Let's create a virtual environment named **mamba** that contains all dependencies needed to run ``r program`` from the source file. + +```bash +cd seqsender +micromamba create --name mamba --file env.yaml +``` + +![](images/micromamba-env.png) + +3. 
Activate the named environment -- **mamba** + +```bash +micromamba activate mamba +``` + +## Run ``r program`` within the `mamba` environment + +First, let's look at the list of commands in ``r program``. Currently, there are five implemented commands in ``r program``: `prep`, `submit`, `check_submission_status`, `template`, `version`. + +```bash +python seqsender.py --help +``` + +![](images/seqsender.png) +
+ +To see the arguments required for each command, for example, the `submit` command, run + +```bash +python seqsender.py submit --help +``` + +![](images/seqsender-submit.png) + + +## Submit a `test` submission + +Rather than hastily jumping in and submitting a `production` submission right away, we can utilize GISAID's and NCBI's **“TEST-SERVER”** to upload a `test` submission first. That way, submitters can familiarize themselves with the submission process prior to making a real submission. + +**Note:** Duplicate test submissions will result in an error. Please create new sequence names each time you plan to run test submissions to avoid this issue. + +### Submit a `test` submission with a pre-processed dataset + + + +### Submit a `test` submission with your own dataset + + + +


+ +Any questions or issues? Please report them on our Github issue tracker. + +
+ + + + + diff --git a/.Rproj.user/39CB7C5D/sources/session-644ed55e/B5A31C01 b/.Rproj.user/39CB7C5D/sources/session-644ed55e/B5A31C01 new file mode 100644 index 0000000..d4dc436 --- /dev/null +++ b/.Rproj.user/39CB7C5D/sources/session-644ed55e/B5A31C01 @@ -0,0 +1,26 @@ +{ + "id": "B5A31C01", + "path": "~/Github/seqsender/.gitignore", + "project_path": ".gitignore", + "type": "gitignore", + "hash": "0", + "contents": "", + "dirty": false, + "created": 1707162178817.0, + "source_on_save": false, + "relative_order": 13, + "properties": { + "source_window_id": "", + "Source": "Source", + "cursorPosition": "18,0", + "scrollLine": "0" + }, + "folds": "", + "lastKnownWriteTime": 1707163152, + "encoding": "UTF-8", + "collab_server": "", + "source_window": "", + "last_content_update": 1707163152318, + "read_only": false, + "read_only_alternatives": [] +} \ No newline at end of file diff --git a/.Rproj.user/39CB7C5D/sources/session-644ed55e/B5A31C01-contents b/.Rproj.user/39CB7C5D/sources/session-644ed55e/B5A31C01-contents new file mode 100644 index 0000000..ebbfb40 --- /dev/null +++ b/.Rproj.user/39CB7C5D/sources/session-644ed55e/B5A31C01-contents @@ -0,0 +1,18 @@ +config_files/test_config_*yaml +~$* +__pycache__ +__pycache__/biosample_sra_submission.cpython-36.pyc +__pycache__/genbank_submission.cpython-36.pyc +__pycache__/gisaid_submission.cpython-36.pyc +__pycache__/submission_preparation.cpython-36.pyc +test_input/~$st Submission Instructions.docx +submit.ready +*report.xml +test_input/test_metadata.tsv +upload_log.csv +*.vscode +*.Rproj +.Rproj.user +.Rhistory +.Rbuildignore +docker-compose-*.yaml diff --git a/.Rproj.user/39CB7C5D/sources/session-644ed55e/BF48D78B b/.Rproj.user/39CB7C5D/sources/session-644ed55e/BF48D78B new file mode 100644 index 0000000..cf9d6c9 --- /dev/null +++ b/.Rproj.user/39CB7C5D/sources/session-644ed55e/BF48D78B @@ -0,0 +1,26 @@ +{ + "id": "BF48D78B", + "path": "~/Github/seqsender/vignettes/singularity_installation.Rmd", + "project_path": 
"vignettes/singularity_installation.Rmd", + "type": "r_markdown", + "hash": "562811111", + "contents": "", + "dirty": false, + "created": 1707157073615.0, + "source_on_save": false, + "relative_order": 7, + "properties": { + "source_window_id": "", + "Source": "Source", + "cursorPosition": "135,7", + "scrollLine": "118" + }, + "folds": "", + "lastKnownWriteTime": 1707512279, + "encoding": "UTF-8", + "collab_server": "", + "source_window": "", + "last_content_update": 1707512279812, + "read_only": false, + "read_only_alternatives": [] +} \ No newline at end of file diff --git a/.Rproj.user/39CB7C5D/sources/session-644ed55e/BF48D78B-contents b/.Rproj.user/39CB7C5D/sources/session-644ed55e/BF48D78B-contents new file mode 100644 index 0000000..760653c --- /dev/null +++ b/.Rproj.user/39CB7C5D/sources/session-644ed55e/BF48D78B-contents @@ -0,0 +1,540 @@ +--- +output: rmarkdown::html_document +title: "How to run seqsender with Singularity" +vignette: > + %\VignetteIndexEntry{How to run seqsender with Singularity} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include=FALSE, echo=FALSE, message=FALSE, warning=FALSE} +# R libraries +library(yaml) # for yaml file + +# Read in the DESCRIPTION file +description <- yaml::read_yaml("../DESCRIPTION") + +# Define variables +program <- description$Package + +# Get the docker image +docker_image <- description$Docker + +# Define github repo +github_repo <- description$URL + +# Define github pages URL +github_pages_url <- description$GITHUB_PAGES +``` + + + +**SOFTWARE REQUIREMENTS:** + +- Linux (64-bit) or Mac OS X (64-bit) +- Git version 2.25.1 or later +- Singularity version 3.8.7 or later +- Standard utilities: curl, tar, unzip + +**ADDITIONAL REQUIREMENTS:** + +See [PRE-REQUISITES](`r github_pages_url`/index.html#prerequisites) and [REQUIREMENT FILES](`r github_pages_url`/index.html#requirement-files) before proceeding to the next steps + + +### (1) Convert `r program` Docker image into a 
Singularity image + +There is a ``r program`` Docker image already built and stored on our DockerHub registry: **`r docker_image`**. You can directly pull the Docker Image down from the registry, convert it into a Singularity image, and store it in a destination of your choice. + +``` bash +mkdir -p ~/singularity +singularity build ~/singularity/seqsender.sif docker://cdcgov/seqsender-dev:latest +``` + +### (2) After the Singularity image is built successfully, we can go ahead and use it to run ``r program``. + +Here is the command that shows the help messages of ``r program`` + +``` bash +singularity exec ~/singularity/seqsender.sif seqsender-kickoff --help +``` + +Below is the standard output of the command. + +``` bash +usage: `r program`.py [-h] + {prep,submit,check_submission_status,template,version} ... + +Automate the process of batch uploading consensus sequences and metadata to +databases of your choices + +positional arguments: + {prep,submit,check_submission_status,template,version} + +optional arguments: + -h, --help show this help message and exit +``` + +To see the arguments required for each command, for example, the `submit` command, run + +```bash +singularity exec ~/singularity/seqsender.sif seqsender-kickoff submit --help +``` + +```bash +usage: seqsender.py submit [-h] [--biosample] [--sra] [--genbank] [--gisaid] + --organism {FLU,COV} --submission_name + SUBMISSION_NAME --submission_dir SUBMISSION_DIR + --config_file CONFIG_FILE --metadata_file + METADATA_FILE --fasta_file FASTA_FILE [--table2asn] + [--gff_file GFF_FILE] [--test] + +Create submission files and then batch uploading them to databases of choices. + +optional arguments: + -h, --help show this help message and exit + --biosample, -b Submit to BioSample database. (default: ) + --sra, -s Submit to SRA database. (default: ) + --genbank, -n Submit to Genbank database. (default: ) + --gisaid, -g Submit to GISAID database. 
(default: ) + --organism {FLU,COV} Type of organism data (default: FLU) + --submission_name SUBMISSION_NAME + Name of the submission (default: None) + --submission_dir SUBMISSION_DIR + Directory to where all required files (such as + metadata, fasta, etc.) are stored (default: None) + --config_file CONFIG_FILE + Config file stored in submission directory (default: + None) + --metadata_file METADATA_FILE + Metadata file stored in submission directory (default: + None) + --fasta_file FASTA_FILE + Fasta file stored in submission directory (default: + None) + --table2asn Whether to prepare a Table2asn submission. (default: + False) + --gff_file GFF_FILE An annotation file to add to a Table2asn submission + (default: None) + --test Whether to perform a test submission. (default: False) +``` + +### (3) Submit a `test` submission + +Rather than hastily jumping in and submitting a `production` submission right away, we can utilize GISAID's and NCBI's **“TEST-SERVER”** to upload a `test` submission first. That way, submitters can familiarize themselves with the submission process prior to making a real submission. + +**Note:** Duplicate test submissions will result in an error. Please create new sequence names each time you plan to run test submissions to avoid this issue. + +### Submit a `test` submission with a pre-processed dataset + + + +### Submit a `test` submission with your own dataset + + + +


+ +Any questions or issues? Please report them on our Github issue tracker. + +
+ + + + + diff --git a/.Rproj.user/39CB7C5D/sources/session-644ed55e/C69D906D-contents b/.Rproj.user/39CB7C5D/sources/session-644ed55e/C69D906D-contents new file mode 100644 index 0000000..3858421 --- /dev/null +++ b/.Rproj.user/39CB7C5D/sources/session-644ed55e/C69D906D-contents @@ -0,0 +1,168 @@ +--- +title: "NCBI - SRA" +output: rmarkdown::html_document +vignette: > + %\VignetteIndexEntry{NCBI - SRA} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + + + + +```{r, include=FALSE, echo=FALSE, message=FALSE, warning=FALSE} +# R libraries +library(knitr) # for html table +library(yaml) # for yaml file +library(tidyverse) # for pipe +library(reshape2) # for data manipulation + +# Read in the DESCRIPTION file +description <- yaml::read_yaml("../DESCRIPTION") + +# Define variables +program <- description$Package +title <- "SRA" +prefix <- "sra-" +prefix_examples <- c("sra-loader", "sra-platform") +portals <- c("NCBI", "NCBI", "NCBI", "GISAID", "GISAID") +databases <- c("BIOSAMPLE", "SRA", "GENBANK", "FLU", "COV") +organism <- c("Influenza A Virus", "SARS-COV-2") +organism_abbrev <- c("FLU", "COV") + +# Define github repo +github_repo <- description$URL + +# Define github pages URL +github_pages_url <- description$GITHUB_PAGES + +# Create main config data frame +main_config_df <- data.frame( + portals = portals, + databases = databases +) %>% +dplyr::filter( + databases %in% toupper(!!title) +) + +# Read in data files +main_config_file <- yaml::read_yaml("../config/main_config.yaml") + +# Store all required fields +metadata_df <- reshape2::melt(main_config_file$SUBMISSION_PORTAL$COMMON_FIELDS) %>% + dplyr::transmute( + Column_name = gsub("[*&?#]", "", L1), + Description = value + ) + +# Combine all fields in given databases and portals +for(d in 1:nrow(main_config_df)){ + #d=1 + database <- main_config_df$databases[d] + portal <- main_config_df$portals[which(main_config_df$databases %in% database)] + + if("COMMON_FIELDS" %in% 
names(main_config_file$SUBMISSION_PORTAL$PORTAL_NAMES[[portal]])){ + portal_fields <- reshape2::melt(main_config_file$SUBMISSION_PORTAL$PORTAL_NAMES[[portal]]$COMMON_FIELDS) %>% + dplyr::transmute( + Column_name = gsub("[*&?#]", "", L1), + Description = value + ) + + metadata_df <- metadata_df %>% + dplyr::bind_rows(portal_fields) %>% + dplyr::distinct(.keep_all = TRUE) + + } + + database_fields <- reshape2::melt(main_config_file$SUBMISSION_PORTAL$PORTAL_NAMES[[portal]]$DATABASE[[database]]) %>% + dplyr::transmute( + Column_name = gsub("[*&?#]", "", L1), + Description = value + ) + + metadata_df <- metadata_df %>% + dplyr::bind_rows(database_fields) %>% + dplyr::distinct(.keep_all = TRUE) + +} +``` + +## Overview + +**Sequence Read Archive (SRA)** data, available through multiple cloud providers and NCBI servers, is the largest publicly available repository of high throughput sequencing data. The archive accepts data from all branches of life as well as metagenomic and environmental surveys. **SRA** stores raw sequencing data and alignment information to enhance reproducibility and facilitate new discoveries through data analysis. + +Before submitters can upload sequence read archives to the **`r title`** database using ``r program``, they must ensure the requirement files (such as `config.yaml`, `metadata.csv`, `sequence.fasta`, `raw reads`, etc.) are prepared ahead of time and stored in a submission folder of choice (e.g., `submission_name`) within a parent submission directory (e.g., `submission_dir`). That way, ``r program`` will be able to pick up the necessary files in that folder, generate submission files, and then batch upload them to the database of choice. 
+ +## Requirement files + +- [Config file](#config-file) in a `yaml` format +- [Sequence read archives](#sequence-read-archives) in a `bam/sff/hdf5/fastq` format +- [Metadata file](#metadata-file) in a `csv` format + +A quick look at where to store all of the requirement files + +![](images/submission_dir.png) + +### Config file + +The config file is a YAML file that provides a brief description of the submission and contains user credentials that allow ``r program`` to authenticate with the database prior to uploading a submission. + +![](images/config_file.png) + +:::{style="padding: 10px; border: 1px solid blue !important;"} + **NOTE:**
+ +- To submit to NCBI only, one can remove the **GISAID Submission (b)** section from the config file. Vice versa, to submit to GISAID only, just remove the **NCBI Submission (a)** section.
+- **Submission_Position** determines the order in which the databases are submitted to. For instance, if GISAID is set as `1`, **_`r program`_** will submit to GISAID first; then, after all samples are assigned a GISAID accession number, **_`r program`_** will proceed to submit to NCBI. This order of submission ensures samples are linked correctly between the two databases.
+- **Username** and **Password** under the **NCBI Submission (a)** section are the credentials used to authenticate with the **NCBI FTP Server** (not to be confused with an individual NCBI account). See [PRE-REQUISITES](`r github_pages_url`/index.html#prerequisites) for more details. +::: + +### Sequence read archives + +Currently, NCBI accepts binary files such as BAM, SFF, and HDF5 formats and text formats such as FASTQ. See [SRA Submit Formats](https://www.ncbi.nlm.nih.gov/sra/docs/submitformats/) for more details. + +:::{style="padding: 10px; border: 1px solid blue !important;"} + **NOTE:**
+ +- Sequence read archive for all samples must be stored in a subfolder called `raw_reads` inside a submission folder of choice +::: + +
+ +### Metadata file + +The metadata worksheet is a comma-delimited (csv) file that contains required attributes that are useful for the rapid analysis and trace back of **`r paste0(organism, collapse=" or ")`** cases. + +Here is a short description about the fields in the metadata worksheet. + +```{r include=TRUE, echo=FALSE, message=FALSE, warning=FALSE} +knitr::kable(metadata_df, format = "html", row.names = FALSE, escape = FALSE) +``` + +
+ +**NOTE:** The prefix of **“`r prefix`”** is used to identify attributes for **`r title`** submissions + +To include additional attributes in **`r title`** submissions, just prepend ``r prefix`` to the desired attributes, e.g. ``r paste0( prefix_examples, collapse=", ")``, etc. See [SRA metadata section](https://www.ncbi.nlm.nih.gov/sra/docs/submitmeta/) for more details. + +

+ +

[* You are now ready to install ``r program`` and batch upload your submission*](`r github_pages_url`/articles/local_installation.html)

+ +


+ +Any questions or issues? Please report them on our Github issue tracker. + +
+ diff --git a/.Rproj.user/shared/notebooks/24F5CC26-gisaid_flu_submission/1/39CB7C5D644ed55e/chunks.json b/.Rproj.user/shared/notebooks/24F5CC26-gisaid_flu_submission/1/39CB7C5D644ed55e/chunks.json new file mode 100644 index 0000000..08ff90e --- /dev/null +++ b/.Rproj.user/shared/notebooks/24F5CC26-gisaid_flu_submission/1/39CB7C5D644ed55e/chunks.json @@ -0,0 +1 @@ +{"chunk_definitions":[],"doc_write_time":1707160579} \ No newline at end of file diff --git a/.Rproj.user/shared/notebooks/24F5CC26-gisaid_flu_submission/1/s/chunks.json b/.Rproj.user/shared/notebooks/24F5CC26-gisaid_flu_submission/1/s/chunks.json new file mode 100644 index 0000000..08ff90e --- /dev/null +++ b/.Rproj.user/shared/notebooks/24F5CC26-gisaid_flu_submission/1/s/chunks.json @@ -0,0 +1 @@ +{"chunk_definitions":[],"doc_write_time":1707160579} \ No newline at end of file diff --git a/.Rproj.user/shared/notebooks/289371A6-genbank_submission/1/s/chunks.json b/.Rproj.user/shared/notebooks/289371A6-genbank_submission/1/s/chunks.json new file mode 100644 index 0000000..38b3f11 --- /dev/null +++ b/.Rproj.user/shared/notebooks/289371A6-genbank_submission/1/s/chunks.json @@ -0,0 +1 @@ +{"chunk_definitions":[],"doc_write_time":1707160691} \ No newline at end of file diff --git a/.Rproj.user/shared/notebooks/5ED2DC74-local_installation/1/39CB7C5D644ed55e/chunks.json b/.Rproj.user/shared/notebooks/5ED2DC74-local_installation/1/39CB7C5D644ed55e/chunks.json new file mode 100644 index 0000000..b8dafdd --- /dev/null +++ b/.Rproj.user/shared/notebooks/5ED2DC74-local_installation/1/39CB7C5D644ed55e/chunks.json @@ -0,0 +1 @@ +{"chunk_definitions":[],"doc_write_time":1707157055} \ No newline at end of file diff --git a/.Rproj.user/shared/notebooks/5ED2DC74-local_installation/1/s/chunks.json b/.Rproj.user/shared/notebooks/5ED2DC74-local_installation/1/s/chunks.json new file mode 100644 index 0000000..b8dafdd --- /dev/null +++ b/.Rproj.user/shared/notebooks/5ED2DC74-local_installation/1/s/chunks.json @@ -0,0 +1 @@ 
+{"chunk_definitions":[],"doc_write_time":1707157055} \ No newline at end of file diff --git a/.Rproj.user/shared/notebooks/67CC0C7B-docker_installation/1/39CB7C5D644ed55e/chunks.json b/.Rproj.user/shared/notebooks/67CC0C7B-docker_installation/1/39CB7C5D644ed55e/chunks.json new file mode 100644 index 0000000..a3db2e2 --- /dev/null +++ b/.Rproj.user/shared/notebooks/67CC0C7B-docker_installation/1/39CB7C5D644ed55e/chunks.json @@ -0,0 +1 @@ +{"chunk_definitions":[],"doc_write_time":1707157101} \ No newline at end of file diff --git a/.Rproj.user/shared/notebooks/67CC0C7B-docker_installation/1/s/chunks.json b/.Rproj.user/shared/notebooks/67CC0C7B-docker_installation/1/s/chunks.json new file mode 100644 index 0000000..a3db2e2 --- /dev/null +++ b/.Rproj.user/shared/notebooks/67CC0C7B-docker_installation/1/s/chunks.json @@ -0,0 +1 @@ +{"chunk_definitions":[],"doc_write_time":1707157101} \ No newline at end of file diff --git a/.Rproj.user/shared/notebooks/917137EC-compose_installation/1/39CB7C5D644ed55e/chunks.json b/.Rproj.user/shared/notebooks/917137EC-compose_installation/1/39CB7C5D644ed55e/chunks.json new file mode 100644 index 0000000..d1a4f71 --- /dev/null +++ b/.Rproj.user/shared/notebooks/917137EC-compose_installation/1/39CB7C5D644ed55e/chunks.json @@ -0,0 +1 @@ +{"chunk_definitions":[],"doc_write_time":1707157120} \ No newline at end of file diff --git a/.Rproj.user/shared/notebooks/917137EC-compose_installation/1/s/chunks.json b/.Rproj.user/shared/notebooks/917137EC-compose_installation/1/s/chunks.json new file mode 100644 index 0000000..d1a4f71 --- /dev/null +++ b/.Rproj.user/shared/notebooks/917137EC-compose_installation/1/s/chunks.json @@ -0,0 +1 @@ +{"chunk_definitions":[],"doc_write_time":1707157120} \ No newline at end of file diff --git a/.Rproj.user/shared/notebooks/9FCF16B4-singularity_installation/1/39CB7C5D644ed55e/chunks.json b/.Rproj.user/shared/notebooks/9FCF16B4-singularity_installation/1/39CB7C5D644ed55e/chunks.json new file mode 100644 index 
0000000..6d07b4a --- /dev/null +++ b/.Rproj.user/shared/notebooks/9FCF16B4-singularity_installation/1/39CB7C5D644ed55e/chunks.json @@ -0,0 +1 @@ +{"chunk_definitions":[],"doc_write_time":1707157083} \ No newline at end of file diff --git a/.Rproj.user/shared/notebooks/9FCF16B4-singularity_installation/1/s/chunks.json b/.Rproj.user/shared/notebooks/9FCF16B4-singularity_installation/1/s/chunks.json new file mode 100644 index 0000000..6d07b4a --- /dev/null +++ b/.Rproj.user/shared/notebooks/9FCF16B4-singularity_installation/1/s/chunks.json @@ -0,0 +1 @@ +{"chunk_definitions":[],"doc_write_time":1707157083} \ No newline at end of file diff --git a/.Rproj.user/shared/notebooks/D111BEFE-biosample_submission/1/39CB7C5D644ed55e/chunks.json b/.Rproj.user/shared/notebooks/D111BEFE-biosample_submission/1/39CB7C5D644ed55e/chunks.json new file mode 100644 index 0000000..85a5415 --- /dev/null +++ b/.Rproj.user/shared/notebooks/D111BEFE-biosample_submission/1/39CB7C5D644ed55e/chunks.json @@ -0,0 +1 @@ +{"chunk_definitions":[],"doc_write_time":1707160709} \ No newline at end of file diff --git a/.Rproj.user/shared/notebooks/D111BEFE-biosample_submission/1/s/chunks.json b/.Rproj.user/shared/notebooks/D111BEFE-biosample_submission/1/s/chunks.json new file mode 100644 index 0000000..85a5415 --- /dev/null +++ b/.Rproj.user/shared/notebooks/D111BEFE-biosample_submission/1/s/chunks.json @@ -0,0 +1 @@ +{"chunk_definitions":[],"doc_write_time":1707160709} \ No newline at end of file diff --git a/.Rproj.user/shared/notebooks/D5A86BB8-gisaid_cov_submission/1/s/chunks.json b/.Rproj.user/shared/notebooks/D5A86BB8-gisaid_cov_submission/1/s/chunks.json new file mode 100644 index 0000000..34b3a00 --- /dev/null +++ b/.Rproj.user/shared/notebooks/D5A86BB8-gisaid_cov_submission/1/s/chunks.json @@ -0,0 +1 @@ +{"chunk_definitions":[],"doc_write_time":1707160672} \ No newline at end of file diff --git a/.Rproj.user/shared/notebooks/EAFD2BFE-sra_submission/1/s/chunks.json 
b/.Rproj.user/shared/notebooks/EAFD2BFE-sra_submission/1/s/chunks.json new file mode 100644 index 0000000..4990000 --- /dev/null +++ b/.Rproj.user/shared/notebooks/EAFD2BFE-sra_submission/1/s/chunks.json @@ -0,0 +1 @@ +{"chunk_definitions":[],"doc_write_time":1707160381} \ No newline at end of file diff --git a/.Rproj.user/shared/notebooks/patch-chunk-names b/.Rproj.user/shared/notebooks/patch-chunk-names new file mode 100644 index 0000000..e69de29 diff --git a/.Rproj.user/shared/notebooks/paths b/.Rproj.user/shared/notebooks/paths new file mode 100644 index 0000000..4b9eedf --- /dev/null +++ b/.Rproj.user/shared/notebooks/paths @@ -0,0 +1,16 @@ +/home/snu3/Github/seqsender/LICENSE="68DAA5BB" +/home/snu3/Github/seqsender/README.Rmd="45B22F6D" +/home/snu3/Github/seqsender/inst/CITATION="94DDA62B" +/home/snu3/Github/seqsender/vignettes/biosample_submission.Rmd="D111BEFE" +/home/snu3/Github/seqsender/vignettes/compose_installation.Rmd="917137EC" +/home/snu3/Github/seqsender/vignettes/docker_installation.Rmd="67CC0C7B" +/home/snu3/Github/seqsender/vignettes/faqs.Rmd="57260CA3" +/home/snu3/Github/seqsender/vignettes/genbank_submission.Rmd="289371A6" +/home/snu3/Github/seqsender/vignettes/gisaid_cov_submission.Rmd="D5A86BB8" +/home/snu3/Github/seqsender/vignettes/gisaid_flu_submission.Rmd="24F5CC26" +/home/snu3/Github/seqsender/vignettes/gisaid_options.Rmd="EDA2E52D" +/home/snu3/Github/seqsender/vignettes/local_installation.Rmd="5ED2DC74" +/home/snu3/Github/seqsender/vignettes/prerequisites.Rmd="A849AD38" +/home/snu3/Github/seqsender/vignettes/singularity_installation.Rmd="9FCF16B4" +/home/snu3/Github/seqsender/vignettes/sra_options.Rmd="25888850" +/home/snu3/Github/seqsender/vignettes/sra_submission.Rmd="EAFD2BFE" diff --git a/.gitignore b/.gitignore index f29e64d..ebbfb40 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,9 @@ submit.ready *report.xml test_input/test_metadata.tsv upload_log.csv +*.vscode +*.Rproj +.Rproj.user +.Rhistory +.Rbuildignore 
+docker-compose-*.yaml diff --git a/DESCRIPTION b/DESCRIPTION index 6a2064f..1a7ddf7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: seqsender Type: Package Title: Public Database Submission Pipeline -Version: 1.0.0 +Version: 1.1.0 Authors@R: c( person(given="Dakota", family="Howard", role=c("aut","cre"), @@ -20,17 +20,17 @@ Authors@R: email="ynh4@cdc.gov"), person(given="Brian", family="Lee", role=c("aut"), email="fya1@cdc.gov"), - person(given="Ben", family="Rambo-Martin", role=c("aut"), + person(given="Benjamin", family="Rambo-Martin", role=c("aut"), email="nbx0@cdc.gov", comment=c(ORCID="0000-0002-8591-3954")) ) Description: seqsender is a Python program that is designed to automate the process of generating - necessary submission files (e.g. submission.xml, submission.zip, etc.) - and then bulk uploading them via FTP to NCBI archives such as Genbank, BioSample, and SRA. - Additionally, the program can batch uploading submissions of meta- and sequence-data to GISAID - using their Command Line Interface Tools (e.g., EpiFlu and EpiCoV CLI). - Currently, the pipeline is capable of uploading Influenza A Virus and SARS-COV-2 data. + necessary submission files (e.g. submission.xml, submission.zip, etc.) + and then bulk uploading them via FTP to NCBI archives such as Genbank, BioSample, and SRA. + Additionally, the program can batch uploading submissions of meta- and sequence-data to GISAID + using their Command Line Interface Tools (e.g., EpiFlu and EpiCoV CLI). + Currently, the pipeline is capable of uploading Influenza A Virus and SARS-COV-2 data. 
RoxygenNote: 7.2.3 -License: GPL-3 + file LICENSE +License: Apache License (== 2.0) + file LICENSE URL: https://github.com/CDCgov/seqsender GITHUB_PAGES: https://cdcgov.github.io/seqsender Docker: cdcgov/seqsender-dev:latest diff --git a/NEWS.md b/NEWS.md index 59f1eeb..b96c15c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,4 @@ - -# MIRA 1.0.0 -* Github Repo: https://github.com/CDCgov/MIRA -* Documentation: https://cdcgov.github.io/MIRA + +# seqsender 1.1.0 +* Github Repo: https://github.com/CDCgov/seqsender +* Documentation: https://cdcgov.github.io/seqsender diff --git a/README.Rmd b/README.Rmd index d3047b9..5de7a98 100644 --- a/README.Rmd +++ b/README.Rmd @@ -68,7 +68,7 @@ Here is a quick look of where to store the downloaded **GISAID CLI** package. ## Requirement Files -Before submitter can perform a submission using ``r program``, make sure the requirement files (such as *config.yaml*, *metadata.csv*, *sequence.fasta*, *raw reads*, etc.) are already prepared and stored in a submission directory of choice. +Before submitters can perform a batch submission using ``r program``, they must make sure the requirement files (such as *config.yaml*, *metadata.csv*, *sequence.fasta*, *raw reads*, etc.) are already prepared and stored in a submission directory of choice. (a) To prep for FLU submissions, select one of the databases below to get started: diff --git a/README.md b/README.md index a44b250..b02dd5c 100644 --- a/README.md +++ b/README.md @@ -1,19 +1,14 @@ - - -

- Public Database Submission Pipeline -

**Beta Version**: 1.1.0. This pipeline is currently in Beta testing, and issues could appear during submission. Please use it at your own risk. -Feedback and suggestions are welcome\! +Feedback and suggestions are welcome! **General Disclaimer**: This repository was created for use by CDC programs to collaborate on public health related projects in support of @@ -36,7 +31,7 @@ organisms in future updates or requests. ## Prerequisites - - **NCBI Submissions** +- **NCBI Submissions** `seqsender` utilizes an UI-Less Data Submission Protocol to bulk upload submission files (e.g., *submission.xml*, *submission.zip*, etc.) to @@ -68,11 +63,11 @@ FTP on the command line. Before attempting to submit a submission using gb-admin@ncbi.nlm.nih.gov to discuss requirements for submissions. -5. Coordinate a NCBI namespace name (**spuid\_namespace**) that will be +5. Coordinate a NCBI namespace name (**spuid_namespace**) that will be used with Submitter Provided Unique Identifiers (**spuid**) in the - submission. The liaison of **spuid\_namespace** and **spuid** is - used to report back assigned accessions as well as for cross-linking - objects within submission. The values of **spuid\_namespace** are up + submission. The liaison of **spuid_namespace** and **spuid** is used + to report back assigned accessions as well as for cross-linking + objects within submission. The values of **spuid_namespace** are up to the submitter to decide but they must be unique and well-coordinated prior to make a submission. For more information about these two fields, see @@ -83,9 +78,7 @@ FTP on the command line. Before attempting to submit a submission using [GENBANK](https://cdcgov.github.io/seqsender/articles/genbank_submission.html#metadata) metadata requirements. - - - - **GISAID Submissions** +- **GISAID Submissions** `seqsender` makes use of GISAID’s Command Line Interface tools to bulk uploading meta- and sequence-data to GISAID databases. Presently, the @@ -119,10 +112,10 @@ package. 
## Requirement Files -Before submitter can perform a submission using `seqsender`, make sure -the requirement files (such as *config.yaml*, *metadata.csv*, -*sequence.fasta*, *raw reads*, etc.) are already prepared and stored in -a submission directory of choice. +Before submitters can perform a batch submission using `seqsender`, they +must make sure the requirement files (such as *config.yaml*, +*metadata.csv*, *sequence.fasta*, *raw reads*, etc.) are already +prepared and stored in a submission directory of choice. 1) To prep for FLU submissions, select one of the databases below to get started: @@ -152,14 +145,14 @@ a submission directory of choice. ## Quick Start - - [How to run seqsender - locally](https://cdcgov.github.io/seqsender/articles/local_installation.html) - - [How to run seqsender with - Docker](https://cdcgov.github.io/seqsender/articles/docker_installation.html) - - [How to run seqsender with - Compose](https://cdcgov.github.io/seqsender/articles/compose_installation.html) - - [How to run seqsender with - Singularity](https://cdcgov.github.io/seqsender/articles/singularity_installation.html) +- [How to run seqsender + locally](https://cdcgov.github.io/seqsender/articles/local_installation.html) +- [How to run seqsender with + Docker](https://cdcgov.github.io/seqsender/articles/docker_installation.html) +- [How to run seqsender with + Compose](https://cdcgov.github.io/seqsender/articles/compose_installation.html) +- [How to run seqsender with + Singularity](https://cdcgov.github.io/seqsender/articles/singularity_installation.html) ## Public Domain Standard Notice @@ -237,5 +230,3 @@ repository](https://github.com/CDCgov/template/blob/master/CONTRIBUTING.md), disclaimers](https://github.com/CDCgov/template/blob/master/DISCLAIMER.md), and [code of conduct](https://github.com/CDCgov/template/blob/master/code-of-conduct.md). 
- -test diff --git a/_pkgdown.yml b/_pkgdown.yml index 8512501..759910d 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -9,7 +9,7 @@ news: navbar: structure: - right: [github] + right: [news, github] components: left: - icon: fa-home diff --git a/config/main_config.yaml b/config/main_config.yaml index bb0f952..97393ec 100644 --- a/config/main_config.yaml +++ b/config/main_config.yaml @@ -104,6 +104,10 @@ SUBMISSION_PORTAL: - text: 'E.g., "United Kingdom", "Japan", "China", "United States", etc.' gs-Host: - text: 'Host or source name., E.g. "human", "avian", "chicken", "Anas Acuta", "environment", etc.' + gs-Collection_Month: + - text: 'For incomplete collection dates, use this field instead of "Collection_Date". Month of year: "1" = Jan, "2" = Feb, so forth, "12" = Dec' + gs-Collection_Year: + - text: 'For incomplete collection dates, use this field instead of "Collection_Date". Four digit year as string: e.g. "2023"' gs-Originating_Lab_Id: - text: 'The numeric ID of the sample"s originating laboratory, e.g. "2698"' COV: diff --git a/docs/404.html b/docs/404.html index 22e106b..8e20859 100644 --- a/docs/404.html +++ b/docs/404.html @@ -1,66 +1,27 @@ - - - - + + + + - Page not found (404) • seqsender - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + - - - - -
-
- + +
+ + + - - -
+
+
-
+ + - - diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html new file mode 100644 index 0000000..5f26c01 --- /dev/null +++ b/docs/LICENSE-text.html @@ -0,0 +1,338 @@ + +License • seqsender + + +
+
+ + + +
+
+ + +
                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "{}"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright {yyyy} {name of copyright owner}
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+ +
+ + + +
+ + + +
+ + + + + + + + diff --git a/docs/articles/biosample_submission.html b/docs/articles/biosample_submission.html index cea5909..7489184 100644 --- a/docs/articles/biosample_submission.html +++ b/docs/articles/biosample_submission.html @@ -19,6 +19,8 @@ + +
+
@@ -140,53 +145,89 @@

NCBI - BioSample

-
-

-Overview

-

BioSample is a database containing aggregated information pertaining to reference samples and samples stored in the European Bioinformatics Institute assay databases.

-

Before one can upload experimental samples to BioSample database using seqsender, they must ensure the requirement files (such as config.yaml, metadata.csv, sequence.fasta, raw reads, etc.) are prepared ahead of time and stored in a submission directory of choice.

+
+

Overview +

+

BioSample is a database containing aggregated +information pertaining to reference samples and samples stored in the European Bioinformatics Institute +assay databases.

+

Before submitters can upload their experimental samples to +the BioSample database using seqsender, they +must ensure the requirement files (such as config.yaml, +metadata.csv, sequence.fasta, +raw reads, etc.) are already prepared ahead of time and +stored in a submission folder of choice (e.g., +submission_name) within a parent submission directory +(e.g., submission_dir). That way seqsender +will be able to scoop up the necessary files in that folder, generate +submission files, and then batch upload them to the submitting +database of choice.

-
-

-Requirement files

+
+

Requirement files +

-
-
-

-Config file

-

Config file is a yaml file that provides a brief description about the submission and contains user credentials that allow seqsender to authenticate the database prior to upload a submission.

+

A quick look of where to store all of the requirement files

+

+
+

Config file +

+

Config file is a yaml file that provides a brief description about +the submission and contains user credentials that allow +seqsender to authenticate with the database prior to uploading a +submission.

-

` NOTE:

+

+NOTE:

    -
  • To submit to NCBI only, one can remove the GISAID Submission (b) section from the config file. Vice versa, to submit to GISAID only, just remove the NCBI Submission (a) section.
    +
  • To submit to NCBI only, one can remove the GISAID Submission +(b) section from the config file. Vice versa, to submit to +GISAID only, just remove the NCBI Submission (a) +section.
  • -Submission_Position determines the order of databases in which we will submit to first. For instance, if GISAID is set as Primary, seqsender will submit to GISAID first, then after all samples are assigned with a GISAID accession number, seqsender will proceed to submit to NCBI. This order of submission ensures samples are linked correctly between the two databases.
    +Submission_Position determines the order of +databases in which we will submit to first. For instance, if GISAID is +set as 1, seqsender will submit +to GISAID first, then after all samples are assigned with a GISAID +accession number, seqsender will proceed to +submit to NCBI. This order of submission ensures samples are linked +correctly between the two databases.
  • -Username and Password under the NCBI Submission (b) section are the credentials used to authenticate the NCBI FTP Server (not to mistake with individual NCBI account). See PRE-REQUISITES for more details.
  • +Username and Password under the +NCBI Submission (a) section are the credentials used to +authenticate with the NCBI FTP Server (not to be confused with an +individual NCBI account). See PRE-REQUISITES +for more details.
-
-

-Metadata file

-

Here is a short description about the fields in the metadata worksheet.

+
+

Metadata file +

+

The metadata worksheet is a comma-delimited (csv) file that contains +required attributes that are useful for the rapid analysis and trace +back of Influenza A Virus or SARS-COV-2 cases.

+

Here is a short description about the fields in the metadata +worksheet.

@@ -210,7 +252,12 @@

organism

@@ -218,7 +265,8 @@

collection_date

@@ -226,7 +274,9 @@

authors

@@ -234,7 +284,9 @@

ncbi-spuid

@@ -242,7 +294,9 @@

ncbi-spuid_namespace

@@ -258,7 +312,8 @@

bs-description

@@ -274,7 +329,12 @@

bs-geo_loc_name

@@ -282,7 +342,8 @@

bs-host

@@ -290,7 +351,11 @@

bs-host_disease

@@ -298,7 +363,8 @@

bs-isolate

@@ -306,7 +372,8 @@

bs-isolation_source

@@ -314,21 +381,30 @@

bs-lat_lon

@@ -202,7 +243,8 @@

sequence_name

-Sequence identifier used in fasta file. This is used to create the fasta file for Genbank or GISAID. +Sequence identifier used in fasta file. This is used to create the fasta +file for Genbank or GISAID.
-The most descriptive organism name for the samples. If relevant, you can search the organism name in the NCBI Taxonomy database. For FLU, organism must be “Influenza A Virus”. For COV, organism must be “Severe acute respiratory syndrome coronavirus 2”. +The most descriptive organism name for the samples. If relevant, you can +search the organism name in the +NCBI +Taxonomy database. For FLU, organism must be “Influenza A +Virus”. For COV, organism must be “Severe acute respiratory +syndrome coronavirus 2”.
-The date on which the sample was collected; must be in the ISO format: YYYY-MM-DD.
For example: 2020-03-25 +The date on which the sample was collected; must be in the ISO format: +YYYY-MM-DD.
For example: 2020-03-25
-Citing authors. List of Last, First Middle, suffix separated by a semicolon “;” E.g.: “Baker, Howard Henry, Jr.; Powell, Earl Alexander, III.;” +Citing authors. List of Last, First Middle, suffix separated by a +semicolon “;” E.g.: “Baker, Howard Henry, Jr.; Powell, Earl Alexander, +III.;”
-Submitter Provided Unique Identifiers. This is used to report back assigned accessions as well as for cross-linking objects within submission. +Submitter Provided Unique Identifiers. This is used to report back +assigned accessions as well as for cross-linking objects within +submission.
-If SPUID is used, spuid_namespace has to be provided. The values of spuid_namespace are from controlled vocabulary and need to be coordinated with NCBI prior to submission. +If SPUID is used, spuid_namespace has to be provided. The values of +spuid_namespace are from controlled vocabulary and need to be +coordinated with NCBI prior to submission.
-A brief description about the sample, e.g. SARS-CoV-2 Sequencing Baseline Constellation. +A brief description about the sample, e.g. SARS-CoV-2 Sequencing +Baseline Constellation.
-Geographical origin of the sample; use the appropriate name from this list. Use a colon to separate the country or ocean from more detailed information about the location, eg “Canada: Vancouver” or “Germany: halfway down Zugspitze, Alps”. Entering multiple localities in one attribute is not allowed. +Geographical origin of the sample; use the appropriate name from +this +list. Use a colon to separate the country or ocean from more +detailed information about the location, eg “Canada: Vancouver” or +“Germany: halfway down Zugspitze, Alps”. Entering multiple localities in +one attribute is not allowed.
-The natural (as opposed to laboratory) host to the organism from which the sample was obtained. Use the full taxonomic name, eg, Homo sapiens. +The natural (as opposed to laboratory) host to the organism from which +the sample was obtained. Use the full taxonomic name, eg, Homo sapiens.
-Name of relevant disease, e.g. Salmonella gastroenteritis. Controlled vocabulary, please see Human Disease Ontology or MeSH +Name of relevant disease, e.g. Salmonella gastroenteritis. Controlled +vocabulary, please see +Human +Disease Ontology or +MeSH
-Identification or description of the specific individual from which this sample was obtained. +Identification or description of the specific individual from which this +sample was obtained.
-Describes the physical, environmental and/or local geographical source of the biological sample from which the sample was derived. +Describes the physical, environmental and/or local geographical source +of the biological sample from which the sample was derived.
-The geographical coordinates of the location where the sample was collected. Specify as degrees latitude and longitude in format “d[d.dddd] N|S d[dd.dddd] W|E”, eg, 38.98 N 77.11 W +The geographical coordinates of the location where the sample was +collected. Specify as degrees latitude and longitude in format +“d[d.dddd] N|S d[dd.dddd] W|E”, eg, 38.98 N 77.11 W


-

NOTE: The prefix of “bs-” is used to identity attributes for BioSample submissions

-

To include additional attributes to BioSample submissions, just append bs- in front of the desired attributes, e.g. bs-host_age, bs-host_sex, etc. See Pathogen.cl.1.0 package for more attributes.

+

NOTE: The prefix “bs-” is used +to identify attributes for BioSample submissions.

+

To include additional attributes in BioSample +submissions, just prepend bs- to the desired +attributes, e.g. bs-host_age, bs-host_sex, etc. See the Pathogen.cl.1.0 +package for more attributes.



-
@@ -345,11 +421,13 @@

-

Site built with pkgdown 1.6.1.

+

+

Site built with pkgdown 2.0.7.9000.

@@ -358,5 +436,7 @@

+ + diff --git a/docs/articles/compose_installation.html b/docs/articles/compose_installation.html index cfb2d1d..2b7a522 100644 --- a/docs/articles/compose_installation.html +++ b/docs/articles/compose_installation.html @@ -19,6 +19,8 @@ + +
+
@@ -157,281 +162,407 @@

How to run seqsender with Compose

  • Standard utilities: curl, tar, unzip
  • ADDITIONAL REQUIREMENTS:

    -

    See PRE-REQUISITES and REQUIREMENT FILES before proceeding to the next steps

    -
    -

    -(1) Clone seqsender repo to your $HOME directory

    -
    cd $HOME
    -git clone https://github.com/CDCgov/seqsender.git
    +

    See PRE-REQUISITES +and REQUIREMENT +FILES before proceeding to the next steps

    +
    +

    (1) Clone seqsender repo to your $HOME directory +

    +
    cd $HOME
    +git clone https://github.com/CDCgov/seqsender.git
    -