TCGA

bioinformatics
database
Published

September 24, 2024

Modified

October 14, 2024

TCGA数据库

https://portal.gdc.cancer.gov/

https://www.jingege.wang/2024/02/25/tcga-gdc-data-portal-2-0/

https://www.jingege.wang/2022/07/16/tcgar/

Show the code
library(jsonlite)

# Read the GDC cart metadata JSON and hold it as a tibble.
json <- as_tibble(jsonlite::fromJSON("data/metadata.cart.2024-07-04.json"))

# Preview the list-column of per-file associated entities.
select(json, associated_entities)
# A tibble: 536 × 1
   associated_entities
   <list>             
 1 <df [1 × 4]>       
 2 <df [1 × 4]>       
 3 <df [1 × 4]>       
 4 <df [1 × 4]>       
 5 <df [1 × 4]>       
 6 <df [1 × 4]>       
 7 <df [1 × 4]>       
 8 <df [1 × 4]>       
 9 <df [1 × 4]>       
10 <df [1 × 4]>       
# ℹ 526 more rows
Show the code
# Inspect the associated-entity record attached to the first file.
json[["associated_entities"]][[1]]
           entity_submitter_id entity_type                              case_id
1 TCGA-38-7271-01A-11R-2039-07     aliquot 8214a0d1-5e2d-4a7a-acb1-e5580755db83
                             entity_id
1 8d895d3d-9598-459b-813e-3fc84dd3efe8
Show the code
# Map every downloaded file name to the submitter ID (TCGA barcode) of its
# associated entity.
# vapply() enforces exactly one character ID per file, so a file with zero or
# several associated entities fails here instead of silently producing a
# list-column. The column is selected by name rather than by position so the
# lookup does not depend on the key order of the JSON.
sample_id <- vapply(
  json$associated_entities,
  function(x) x[["entity_submitter_id"]],
  character(1)
)

file_sample <- tibble(sample_id, file_name = json$file_name)
Show the code
# Collect the relative path (sub-directory + file name) of every TSV
# expression file under the gdc_download folder.
# `pattern` is a regular expression, not a glob: escape the dot and anchor the
# extension so that only names ending in ".tsv" are matched.
count_file <- list.files(
  "data/gdc_download_20240704_131631.625023",
  pattern = "\\.tsv$",
  recursive = TRUE
)

# Keep only the file name of each path; basename() works regardless of how
# deeply the file is nested.
count_file_name <- basename(count_file)

计数矩阵

Show the code
# Build the gene-by-sample count matrix from the downloaded TSV files and
# write it to data/COUNT_matrix.csv.
download_dir <- "data/gdc_download_20240704_131631.625023"

if (length(count_file) == 0) {
  stop("No count files found under ", download_dir, call. = FALSE)
}

# Collect one single-column data frame per file and bind them once at the end,
# instead of growing a data frame with cbind() on every iteration.
count_columns <- vector("list", length(count_file))

for (i in seq_along(count_file)) {
  path <- file.path(download_dir, count_file[i])
  sample_counts <- read_tsv(path, col_names = TRUE, skip = 1)

  # Drop the first four data rows (presumably the alignment summary rows of
  # the count file -- TODO confirm) and keep column 1 (gene_id, the Ensembl
  # ID) and column 4 (unstranded, i.e. the count data). Column 2 is the gene
  # symbol.
  sample_counts <- sample_counts[-c(1:4), c(1, 4)] |>
    column_to_rownames(var = "gene_id")

  # Look up the sample ID that belongs to this file name.
  sample_idx <- match(count_file_name[i], file_sample$file_name)
  if (is.na(sample_idx)) {
    stop("No metadata entry for file ", count_file_name[i], call. = FALSE)
  }
  colnames(sample_counts) <- file_sample$sample_id[sample_idx]

  # cbind() aligns rows by position only, so refuse files whose gene order
  # differs from the first file.
  if (i > 1 &&
    !identical(rownames(sample_counts), rownames(count_columns[[1]]))) {
    stop("Gene IDs in ", path, " do not match the first file", call. = FALSE)
  }

  count_columns[[i]] <- sample_counts
}

matrix <- do.call(cbind, count_columns)
write.csv(matrix, "data/COUNT_matrix.csv", row.names = TRUE)

normal 和 tumor

根据TCGA样本条形码(barcode)的命名可以区分正常组织和肿瘤样本的测序结果:第14、15位为样本类型编号,01–09 表示肿瘤样本,10–19 表示正常样本(例如 11 为癌旁正常组织)。

Show the code
# Reload the saved count matrix; read_csv() names the unnamed first column
# "...1", which holds the gene IDs and becomes the row names again.
COUNT_matrix <- column_to_rownames(
  read_csv("data/COUNT_matrix.csv"),
  var = "...1"
)

# Tabulate characters 14-15 of every sample ID (column name).
table(str_sub(colnames(COUNT_matrix), 14, 15))

 01  02  11 
481   2  53 

表型矩阵

Show the code
# Build the phenotype (clinical) table: one row per sample, annotated with the
# clinical record of the case the sample belongs to.
json <- jsonlite::fromJSON("data/metadata.cart.2024-07-04.json")

# Extract exactly one submitter ID and one case ID per file. vapply() fails
# loudly if a file has zero or several associated entities, and selecting the
# columns by name avoids depending on the key order of the JSON.
entity_submitter_id <- vapply(
  json$associated_entities,
  function(x) x[["entity_submitter_id"]],
  character(1)
)
case_id <- vapply(
  json$associated_entities,
  function(x) x[["case_id"]],
  character(1)
)
sample_case <- tibble(entity_submitter_id, case_id)

clinical <- read.delim(
  "data/clinical.cart.2024-07-04/clinical.tsv",
  header = TRUE
)

# Keep a single clinical row per case so the join below cannot duplicate
# samples.
clinical <- clinical |> distinct(case_id, .keep_all = TRUE)

clinical_matrix <- left_join(sample_case, clinical, by = join_by(case_id))

# The case ID was only needed as the join key.
clinical_matrix <- clinical_matrix |> select(-case_id)

miRNA 计数矩阵

Show the code
# Build the miRNA-by-sample count matrix from the downloaded quantification
# files.
json <- jsonlite::fromJSON("metadata.cart.2022-09-27.json")

# One submitter ID (first column of the associated-entity record) per file;
# vapply() fails loudly if a file has zero or several associated entities.
sample_id <- vapply(
  json$associated_entities,
  function(x) x[, 1],
  character(1)
)
file_sample <- data.frame(sample_id, file_name = json$file_name)

# Collect the relative path (sub-directory + file name) of every miRNA
# expression file under the gdc_download folder.
# `pattern` is a regular expression, not a glob: escape the dot and anchor the
# suffix so that only names ending in "quantification.txt" are matched.
download_dir <- "gdc_download_20220927_150057.906231"
count_file <- list.files(
  download_dir,
  pattern = "quantification\\.txt$",
  recursive = TRUE
)
# Keep only the file name of each path.
count_file_name <- basename(count_file)

if (length(count_file) == 0) {
  stop("No quantification files found under ", download_dir, call. = FALSE)
}

# Collect one single-column data frame per file and bind them once at the end,
# instead of growing a data frame with cbind() on every iteration.
count_columns <- vector("list", length(count_file))

for (i in seq_along(count_file)) {
  path <- file.path(download_dir, count_file[i])
  sample_counts <- read.delim(path, fill = TRUE, header = TRUE, row.names = 1)
  # Keep the count column (column 1); column 2 holds the RPM values.
  sample_counts <- sample_counts[1]

  # Look up the sample ID that belongs to this file name.
  sample_idx <- match(count_file_name[i], file_sample$file_name)
  if (is.na(sample_idx)) {
    stop("No metadata entry for file ", count_file_name[i], call. = FALSE)
  }
  colnames(sample_counts) <- file_sample$sample_id[sample_idx]

  # cbind() aligns rows by position only, so refuse files whose miRNA order
  # differs from the first file.
  if (i > 1 &&
    !identical(rownames(sample_counts), rownames(count_columns[[1]]))) {
    stop("miRNA IDs in ", path, " do not match the first file", call. = FALSE)
  }

  count_columns[[i]] <- sample_counts
}

matrix <- do.call(cbind, count_columns)

TCGAWorkflow

https://bioconductor.org/packages/release/workflows/html/TCGAWorkflow.html