TCGA数据库

Show the code

library(jsonlite)

# Read the GDC cart metadata (one row per downloaded file) into a tibble.
json <- as_tibble(jsonlite::fromJSON("data/metadata.cart.2024-07-04.json"))

# Peek at the list-column that holds the entities linked to each file.
select(json, associated_entities)

# A tibble: 536 × 1
   associated_entities
   <list>             
 1 <df [1 × 4]>       
 2 <df [1 × 4]>       
 3 <df [1 × 4]>       
 4 <df [1 × 4]>       
 5 <df [1 × 4]>       
 6 <df [1 × 4]>       
 7 <df [1 × 4]>       
 8 <df [1 × 4]>       
 9 <df [1 × 4]>       
10 <df [1 × 4]>       
# ℹ 526 more rows

Show the code

# Show the entity record attached to the first file in the metadata.
json[["associated_entities"]][[1]]

           entity_submitter_id entity_type                              case_id
1 TCGA-38-7271-01A-11R-2039-07     aliquot 8214a0d1-5e2d-4a7a-acb1-e5580755db83
                             entity_id
1 8d895d3d-9598-459b-813e-3fc84dd3efe8

Show the code

# Extract the sample barcode (entity_submitter_id) recorded for each file.
# vapply() guarantees a plain character vector with exactly one barcode per
# file and fails loudly if a file is linked to zero or several entities,
# instead of silently returning a list as sapply() would.
sample_id <- vapply(
  json$associated_entities,
  function(x) x[["entity_submitter_id"]],
  character(1)
)

# Lookup table pairing each sample barcode with its downloaded file name.
file_sample <- tibble(sample_id, file_name = json$file_name)

Show the code

# List every TSV expression file under the gdc_download folder; the returned
# paths are relative to that folder. `pattern` is a regular expression, not a
# glob, so match the literal ".tsv" extension anchored at the end of the name.
count_file <- list.files(
  "data/gdc_download_20240704_131631.625023",
  pattern = "\\.tsv$",
  recursive = TRUE
)

# Strip the directory part to get the bare file names, whatever the nesting
# depth of the download folder.
count_file_name <- basename(count_file)

计数矩阵

Show the code

# Assemble the gene-by-sample raw count matrix from the per-sample count files
# and save it as CSV.
count_dir <- "data/gdc_download_20240704_131631.625023"

if (length(count_file) == 0) {
  stop("No count files found under ", count_dir, call. = FALSE)
}

# Resolve every file to its sample barcode up front, so a file that is missing
# from the metadata is reported before any file is read.
sample_idx <- match(count_file_name, file_sample$file_name)
if (anyNA(sample_idx)) {
  stop(
    "No metadata entry for: ",
    paste(count_file_name[is.na(sample_idx)], collapse = ", "),
    call. = FALSE
  )
}

# Read each file into a one-column data frame (row names = gene_id, column
# name = sample barcode).
count_list <- lapply(seq_along(count_file), function(i) {
  # skip = 1 skips the first line of the file
  # (presumably a comment line above the header -- confirm).
  data <- read_tsv(
    file.path(count_dir, count_file[i]),
    col_names = TRUE,
    skip = 1
  )
  # Keep column 1 (Ensembl gene_id) and column 4 (unstranded, i.e. the raw
  # counts); column 2 holds the gene symbol. The first four rows are dropped
  # (presumably the N_* summary rows of the count file -- confirm).
  data <- data[-c(1:4), c(1, 4)] |> column_to_rownames(var = "gene_id")
  colnames(data) <- file_sample$sample_id[sample_idx[i]]
  data
})

# cbind() pairs rows by position, so every file must list the same genes in
# the same order; stop instead of silently misaligning counts.
gene_id <- rownames(count_list[[1]])
same_genes <- vapply(
  count_list,
  function(x) identical(rownames(x), gene_id),
  logical(1)
)
if (!all(same_genes)) {
  stop(
    "Gene IDs differ from the first file in: ",
    paste(count_file_name[!same_genes], collapse = ", "),
    call. = FALSE
  )
}

# Bind all samples in a single call rather than growing the data frame on
# every iteration.
matrix <- do.call(cbind, count_list)
write.csv(matrix, "data/COUNT_matrix.csv", row.names = TRUE)

normal 和 tumor

根据 TCGA 样本条形码（barcode）的命名规则，可以区分正常组织和肿瘤样本的测序结果：其中第 14、15 位是样本类型编号，01–09 表示肿瘤样本，10–19 表示正常样本，20–29 表示对照样本。

Show the code

# Reload the saved count matrix; read_csv() names the unnamed first column
# "...1", which holds the gene IDs and becomes the row names again.
COUNT_matrix <- column_to_rownames(
  read_csv("data/COUNT_matrix.csv"),
  var = "...1"
)

# Tabulate characters 14-15 of each sample barcode (the sample-type code).
sample_type_code <- str_sub(colnames(COUNT_matrix), 14, 15)
table(sample_type_code, dnn = NULL)


 01  02  11 
481   2  53

表型矩阵

Show the code

# Build the phenotype (clinical) table: one row per sequenced sample, carrying
# the clinical record of the case that sample belongs to.
json <- jsonlite::fromJSON("data/metadata.cart.2024-07-04.json")

# One sample barcode and one case UUID per file. vapply() enforces exactly one
# character value per file instead of silently returning a list.
entity_submitter_id <- vapply(
  json$associated_entities,
  function(x) x[["entity_submitter_id"]],
  character(1)
)
case_id <- vapply(
  json$associated_entities,
  function(x) x[["case_id"]],
  character(1)
)
sample_case <- tibble(entity_submitter_id, case_id)

clinical <- read.delim(
  "data/clinical.cart.2024-07-04/clinical.tsv",
  header = TRUE
)

# Keep only the first clinical row for each case so the join below cannot
# duplicate samples.
clinical <- clinical |> distinct(case_id, .keep_all = TRUE)

# Attach clinical data to every sample; `relationship` asserts that each
# sample matches at most one clinical row.
clinical_matrix <- left_join(
  sample_case,
  clinical,
  by = join_by(case_id),
  relationship = "many-to-one"
)

# The case UUID was only needed as the join key.
clinical_matrix <- clinical_matrix |> select(-case_id)

miRNA 计数矩阵

Show the code

# Assemble the miRNA-by-sample count matrix from the per-sample quantification
# files of the miRNA download.
json <- jsonlite::fromJSON("metadata.cart.2022-09-27.json")

# Sample barcode of each file: first column of its associated_entities record.
# vapply() enforces exactly one character value per file.
sample_id <- vapply(
  json$associated_entities,
  function(x) x[[1]],
  character(1)
)
file_sample <- data.frame(sample_id, file_name = json$file_name)

# List every miRNA quantification file under the gdc_download folder (paths
# relative to it). `pattern` is a regular expression, not a glob.
count_dir <- "gdc_download_20220927_150057.906231"
count_file <- list.files(
  count_dir,
  pattern = "quantification\\.txt$",
  recursive = TRUE
)
# Bare file names, independent of how deeply the files are nested.
count_file_name <- basename(count_file)

if (length(count_file) == 0) {
  stop("No quantification files found under ", count_dir, call. = FALSE)
}

# Resolve every file to its sample barcode before reading anything, so a file
# missing from the metadata is reported clearly.
sample_idx <- match(count_file_name, file_sample$file_name)
if (anyNA(sample_idx)) {
  stop(
    "No metadata entry for: ",
    paste(count_file_name[is.na(sample_idx)], collapse = ", "),
    call. = FALSE
  )
}

count_list <- lapply(seq_along(count_file), function(i) {
  data <- read.delim(
    file.path(count_dir, count_file[i]),
    fill = TRUE,
    header = TRUE,
    row.names = 1
  )
  # Keep the count column (column 1); column 2 holds the RPM values.
  data <- data[1]
  colnames(data) <- file_sample$sample_id[sample_idx[i]]
  data
})

# cbind() pairs rows by position, so every file must list the same miRNAs in
# the same order; stop instead of silently misaligning counts.
mirna_id <- rownames(count_list[[1]])
same_ids <- vapply(
  count_list,
  function(x) identical(rownames(x), mirna_id),
  logical(1)
)
if (!all(same_ids)) {
  stop(
    "miRNA IDs differ from the first file in: ",
    paste(count_file_name[!same_ids], collapse = ", "),
    call. = FALSE
  )
}

# Bind all samples in a single call rather than growing the data frame on
# every iteration.
matrix <- do.call(cbind, count_list)

TCGAWorkflow

https://bioconductor.org/packages/release/workflows/html/TCGAWorkflow.html