TCGA数据库
https://portal.gdc.cancer.gov/
https://www.jingege.wang/2024/02/25/tcga-gdc-data-portal-2-0/
https://www.jingege.wang/2022/07/16/tcgar/
Show the code
# Read the GDC cart metadata JSON and hold it as a tibble.
library(jsonlite)
json <- as_tibble(jsonlite::fromJSON("data/metadata.cart.2024-07-04.json"))
# Preview the nested `associated_entities` list-column.
select(json, associated_entities)
# A tibble: 536 × 1
associated_entities
<list>
1 <df [1 × 4]>
2 <df [1 × 4]>
3 <df [1 × 4]>
4 <df [1 × 4]>
5 <df [1 × 4]>
6 <df [1 × 4]>
7 <df [1 × 4]>
8 <df [1 × 4]>
9 <df [1 × 4]>
10 <df [1 × 4]>
# ℹ 526 more rows
Show the code
# Inspect the first nested data frame of the `associated_entities` column.
json[["associated_entities"]][[1]]
entity_submitter_id entity_type case_id
1 TCGA-38-7271-01A-11R-2039-07 aliquot 8214a0d1-5e2d-4a7a-acb1-e5580755db83
entity_id
1 8d895d3d-9598-459b-813e-3fc84dd3efe8
Show the code
# Map every downloaded file name to its sample barcode.
# Column 1 of each `associated_entities` data frame is `entity_submitter_id`.
# vapply() enforces exactly one character value per file, so an entry with
# zero or several associated entities fails loudly instead of silently turning
# `sample_id` into a list (which is what sapply() would do).
# purrr alternative: map_chr(json$associated_entities, \(x) x[[1]])
sample_id <- vapply(
  json$associated_entities,
  function(x) x[[1]],
  character(1)
)
file_sample <- tibble(sample_id, file_name = json$file_name)
Show the code
# Collect the relative path (sub-directory + file name) of every TSV
# expression file under the gdc_download folder.
# `pattern` is a regular expression, not a glob: escape the dot and anchor the
# extension so only names that really end in ".tsv" are matched.
count_file <- list.files(
  "data/gdc_download_20240704_131631.625023",
  pattern = "\\.tsv$",
  recursive = TRUE
)
# Extract the bare file name from each path. basename() works at any nesting
# depth, unlike taking the second "/"-separated component.
count_file_name <- basename(count_file)
计数矩阵
Show the code
# Build the raw count matrix (genes in rows, samples in columns) from the
# per-sample TSV files and save it as CSV.
if (length(count_file) == 0) {
  stop(
    "No count files found under data/gdc_download_20240704_131631.625023",
    call. = FALSE
  )
}

# Resolve the sample barcode of every file once, and fail with a clear message
# when a file has no metadata entry (otherwise the column rename below would
# error on an empty name).
sample_idx <- match(count_file_name, file_sample$file_name)
if (anyNA(sample_idx)) {
  stop(
    "No metadata entry for file(s): ",
    paste(count_file_name[is.na(sample_idx)], collapse = ", "),
    call. = FALSE
  )
}

# Read each file into a one-column data frame named after its sample.
count_list <- lapply(seq_along(count_file), function(i) {
  path <- file.path("data/gdc_download_20240704_131631.625023", count_file[i])
  data <- read_tsv(path, col_names = TRUE, skip = 1)
  # Drop the first four rows (presumably the N_* summary rows of the STAR
  # output -- confirm) and keep column 1 (Ensembl gene_id) and column 4
  # (unstranded, i.e. the raw counts). Column 2 holds the gene symbol.
  data <- data[-c(1:4), c(1, 4)] |> column_to_rownames(var = "gene_id")
  colnames(data) <- file_sample$sample_id[sample_idx[i]]
  data
})

# Bind all samples in a single call instead of growing the data frame inside
# the loop; the number of genes now comes from the files rather than from a
# hard-coded row count. cbind() still errors if files disagree on row count.
matrix <- do.call(cbind, count_list)
write.csv(matrix, "data/COUNT_matrix.csv", row.names = TRUE)
normal 和 tumor
根据 TCGA 样本条码(barcode)的命名规则可以区分正常组织和肿瘤样本的测序结果:条码第 14–15 位是样本类型编号,01–09 表示肿瘤样本,10–19 表示正常组织样本(即编号大于等于 10),20–29 表示对照样本。
Show the code
# Reload the saved count matrix. The first CSV column (the row names written
# by write.csv) has no header, so it is read as "...1"; move it back into the
# row names.
COUNT_matrix <- column_to_rownames(
  read_csv("data/COUNT_matrix.csv"),
  var = "...1"
)
# Tabulate the two-character code at positions 14-15 of each column name.
table(str_sub(colnames(COUNT_matrix), 14, 15))
表型矩阵
Show the code
# Build the phenotype table: one row per metadata entry, annotated with the
# clinical record of the case it belongs to.
json <- jsonlite::fromJSON("data/metadata.cart.2024-07-04.json")

# Columns 1 and 3 of each `associated_entities` data frame are the sample
# barcode (entity_submitter_id) and the case UUID (case_id). vapply()
# guarantees exactly one character value per entry, unlike sapply().
entity_submitter_id <- vapply(
  json$associated_entities,
  function(x) x[[1]],
  character(1)
)
case_id <- vapply(
  json$associated_entities,
  function(x) x[[3]],
  character(1)
)
# Build the two-column lookup directly instead of transposing a matrix.
sample_case <- tibble(entity_submitter_id, case_id)

clinical <- read.delim(
  "data/clinical.cart.2024-07-04/clinical.tsv",
  header = TRUE
)
# Keep only the first clinical row per case so the join cannot duplicate
# samples.
clinical <- clinical |> distinct(case_id, .keep_all = TRUE)
clinical_matrix <- left_join(sample_case, clinical, by = join_by(case_id))
clinical_matrix <- clinical_matrix |> select(-case_id)
miRNA 计数矩阵
Show the code
# Build the miRNA raw count matrix (miRNAs in rows, samples in columns).
json <- jsonlite::fromJSON("metadata.cart.2022-09-27.json")

# Map every downloaded file name to its sample barcode (column 1 of each
# `associated_entities` data frame). vapply() enforces one character value per
# file, where sapply() would silently return a list.
sample_id <- vapply(
  json$associated_entities,
  function(x) x[[1]],
  character(1)
)
file_sample <- data.frame(sample_id, file_name = json$file_name)

# Collect the relative path of every miRNA quantification file under the
# gdc_download folder. `pattern` is a regular expression, not a glob, so the
# dot is escaped and the suffix anchored.
count_file <- list.files(
  "gdc_download_20220927_150057.906231",
  pattern = "quantification\\.txt$",
  recursive = TRUE
)
if (length(count_file) == 0) {
  stop(
    "No quantification files found under gdc_download_20220927_150057.906231",
    call. = FALSE
  )
}
# Extract the bare file name from each path, at any nesting depth.
count_file_name <- basename(count_file)

# Resolve the sample barcode of every file once and fail with a clear message
# when a file has no metadata entry.
sample_idx <- match(count_file_name, file_sample$file_name)
if (anyNA(sample_idx)) {
  stop(
    "No metadata entry for file(s): ",
    paste(count_file_name[is.na(sample_idx)], collapse = ", "),
    call. = FALSE
  )
}

# Read each file into a one-column data frame named after its sample.
count_list <- lapply(seq_along(count_file), function(i) {
  path <- file.path("gdc_download_20220927_150057.906231", count_file[i])
  data <- read.delim(path, fill = TRUE, header = TRUE, row.names = 1)
  # Keep the first remaining column (the counts); the next column is RPM.
  data <- data[1]
  colnames(data) <- file_sample$sample_id[sample_idx[i]]
  data
})

# Bind all samples in a single call instead of growing the data frame inside
# the loop; the row count now comes from the files rather than a hard-coded
# 1881. cbind() still errors if files disagree on row count.
matrix <- do.call(cbind, count_list)