Chapter 3 Methods
We describe our methods in this chapter.
Steps: Set Environment; QC/multiqc of raw reads; Trimming with Trimmomatic; QC/multiqc of trimmed reads; Genome Index and Aligning with STAR.
Raw Data file names original:
mPDL_RNA7D_Ko2_S1_L001_R2_001_2.fastq.gz
mPDL_RNA7D_Ko1_S1_L001_R1_001_1.fastq.gz
mPDL_RNA7D_Ko1_S1_L001_R2_001_2.fastq.gz
mPDL_RNA7D_Ko2_S1_L001_R1_001_1.fastq.gz
Change (for easy management) to: mPDL_RNA7D_Ko1_1.fastq.gz mPDL_RNA7D_Ko1_2.fastq.gz mPDL_RNA7D_Ko2_1.fastq.gz mPDL_RNA7D_Ko2_2.fastq.gz
It seems the original raw data was already split.
# Create the cat file with the file names.
cat > filenames.txt
#Type or copy paste the name of the files to be analyzed.
# After the last line, press Enter and then Control + D to save the file names in the cat file.
#Check the list
head filenames.txt
# Since the files are already split, there is no need to run this section.
# In conda environment
mkdir rawReads
cd rawReads
# Download files from webpage source. (one by one)
fastq-dump --gzip --split-file mPDL_RNA7D_Ko1 &
# The & runs the command in the background. To see the jobs running in the background:
jobs
# Download files from webpage source. (all at once). Better to do overnight.
cat filenames.txt | parallel "fastq-dump --gzip --split-file {}"
# files will be Forward (1) Reverse (2) or Paired reads.
conda install multiqc
# Some git preferences for git contributions.
git status
git config --list
# set git email
git config user.email "email@example.com"
# confirm email
git config --global user.email
# For the rna-seq_test environment.
. /usr/local/anaconda3/bin/activate && conda activate /usr/local/anaconda3/envs/rna-seq_test
cd "/Volumes/HD-PCFSU3-A/Experiments Data/Genewiz/Trial02"
## Run FAST QC
mkdir rawFastQC
fastqc -t 64 rawReads/*fastq.gz -o rawFastQC
fastqc rawReads/*fastq.gz -o rawFastQC
cd rawFastQC
ls
## Run MultiQC
multiqc .
ls
cd..
## Trimming
cat > filenames2.txt
mPDL_RNA7D_Ko1
mPDL_RNA7D_Ko2
cat filenames2.txt
mkdir trimmedReads
# Copy the TruSeq adapter file to the working folder.
cp /usr/local/anaconda3/envs/rna-seq_test/share/trimmomatic-0.39-2/adapters/TruSeq3-PE-2.fa .
# Run Trimmomatic
# Original (one by one)
trimmomatic PE -threads 5 rawReads/mPDL_RNA7D_Ko1_1.fastq.gz rawReads/mPDL_RNA7D_Ko1_2.fastq.gz -baseout trimmedReads/mPDL_RNA7D_Ko1 ILLUMINACLIP:truSeq-PE-2.fa:2:30:10:2:keepBothReads LEADING:3 TRAILING:3 MINLEN:36 2> trimmedReads/mPDL_RNA7D_Ko1trimming.log
# ??? How to determine the leading, trailing and minimum length values?
# Original (all automatically)
cat filenames2.txt | parallel -j 4 "trimmomatic PE -threads 5 rawReads/{}_1.fastq.gz rawReads/{}_2.fastq.gz -baseout trimmedReads/{} ILLUMINACLIP:truSeq-PE-2.fa:2:30:10:2:keepBothReads LEADING:3 TRAILING:3 MINLEN:36 2> trimmedReads/{}trimming.log"
# Check if jobs are running
jobs
# Problems: Takes a long time and sometimes it stops.
# The generic trimmomatic command:
java -jar trimmomatic-0.39.jar PE inputforward.fq.gz inputreverse.fq.gz outputforwardpaired.fq.gz outputforwardunpaired.fq.gz outputreversepaired.fq.gz outputreverseunpaired.fq.gz ILLUMINACLIP:TruSeq3-PE.fa:2:30:10:2:keepBothReads LEADING:3 TRAILING:3 MINLEN:36
# Try 03 SUCCESS!!!?
trimmomatic PE rawReads/mPDL_RNA7D_Ko1_S1_L001_R1_001_1.fastq.gz rawReads/mPDL_RNA7D_Ko1_S1_L001_R2_001_2.fastq.gz -baseout trimmedReads/mPDL_RNA7D_Ko1_S1_L001 ILLUMINACLIP:TruSeq3-PE-2.fa:2:30:10:2:keepBothReads LEADING:3 TRAILING:3 MINLEN:36 2> trimmedReads/mPDL_RNA7D_Ko1_S1_L001trimming.log
# Sample 1 Success!!!!???? error but TrimmomaticPE: Completed successfully message. # error in TruSeq3-PE.fa (No such file or directory)... Change to TruSeq3-PE-2.fa (not tested yet)
trimmomatic PE rawReads/mPDL_RNA7D_Ko1_S1_L001_R1_001_1.fastq.gz rawReads/mPDL_RNA7D_Ko1_S1_L001_R2_001_2.fastq.gz -baseout trimmedReads/mPDL_RNA7D_Ko1_S1_L001_R1_001 ILLUMINACLIP:TruSeq3-PE-2.fa:2:30:10:2:keepBothReads LEADING:3 TRAILING:3 MINLEN:36 2> trimmedReads/mPDL_RNA7D_Ko1_S1_L001trimming.log
# Sample 2 Success!!!!???? error but TrimmomaticPE: Completed successfully message. # error in TruSeq3-PE.fa (No such file or directory)... Change to TruSeq3-PE-2.fa (not tested yet)
trimmomatic PE rawReads/mPDL_RNA7D_Ko2_S1_L001_R1_001_1.fastq.gz rawReads/mPDL_RNA7D_Ko2_S1_L001_R2_001_2.fastq.gz -baseout trimmedReads/mPDL_RNA7D_Ko2_S1_L001_R2_001 ILLUMINACLIP:TruSeq3-PE-2.fa:2:30:10:2:keepBothReads LEADING:3 TRAILING:3 MINLEN:36 2> trimmedReads/mPDL_RNA7D_Ko2_S1_L001trimming.log
# Pipe and loop for all files.
cat filenames.txt | parallel -j 4 "trimmomatic PE rawReads/{}_1.fastq.gz rawReads/{}_2.fastq.gz -baseout trimmedReads/{} ILLUMINACLIP:TruSeq3-PE-2.fa:2:30:10:2keepBothReads LEADING:3 TRAILING:3 MINLEN:36 2> trimmedReads/{}trimming.log" #Not successful.
# In trimmedReads, run multiqc
ls
multiqc .
# fastQC of trimmed files again?
mkdir trimmedFastQC
# rm -d trimmedFastQC # Make sure folder is in main folder, if not delete
# Run fastqc... Trimmed files have no file extension. Why???
fastqc trimmedReads/* -o trimmedFastQC
# .zip and qc html files will be created. Run multiqc
cd trimmedFastQC
multiqc .
Alignment with STAR
- Download the mouse genome from the NCBI page. See 1.3.4
- Unzip the tar.gz file. FYI: This process requires a long time.
# Check the correct environment.
. /usr/local/anaconda3/bin/activate && conda activate /usr/local/anaconda3/envs/rna-seq_test
# Check the correct folder/directory.
cd "/Volumes/HD-PCFSU3-A/Experiments Data/Genewiz/60-499583816_60-501009934/Result/Mus Musculus/Mus_musculus_NCBI_build37.2.tar.gz"
#Data/Genewiz/60-499583816_60-501009934/Result/Mus Musculus/Mus_musculus_NCBI_build37.2.tar.gz: Not a directory
cd "/Volumes/HD-PCFSU3-A/Experiments Data/Genewiz/60-499583816_60-501009934/Result/Mus Musculus/"
# Unzip file
tar xvfz Mus_musculus_NCBI_build37.2.tar.gz
- Confirm that STAR is installed; if not, install STAR.
# Make sure the correct environment is active.
STAR --help
#If not found install.
conda install STAR
# Make sure the package is from "bioconda". Answer the "Proceed ([y]/n)?" prompt:
y
# Check again. Long list of flags.
STAR --help
# Create directory for starIndex.
mkdir starIndex
- Run STAR
1st: index the Mus_musculus whole genome; 2nd: align the samples against the Mus_musculus index.
# Make sure that the unzipped folder of Mus_musculus_NCBI_build37.2.tar.gz (Mus_musculus) is in the same directory as the environment/analysis folder, in this case:
cd "/Volumes/HD-PCFSU3-A/Experiments Data/Genewiz/Trial02/"
# Run STAR. This takes a long time. Wait until the "..... finished successfully" message appears; if it does not, rerun.
STAR --runThreadN 64 --runMode genomeGenerate --genomeDir starIndex --genomeFastaFiles Mus_musculus/NCBI/build37.2/Sequence/WholeGenomeFasta/genome.fa --sjdbGTFfile Mus_musculus/NCBI/build37.2/Annotation/Genes/genes.gtf
- Align reads
# create a new directory
mkdir starAligned
Git token