# DESCRIPTION: ------------
# I will check which TFs are actually binding _preferrably_ to the methylated
# sequence, based on data from Yin, Y., et al. (2017). Science, 356(6337),
# eaaj2239. https://doi.org/10.1126/science.aaj2239 (prepared in
# 'create_TF_methyl_binding_dataset.R') and data collected from JASPAR 2022
# TFBS hg19 via visualization in ensembl
#
# AUTHOR: Julia Romanowska
# DATE CREATED: 2022-01-06
# DATE MODIFIED:
# SETUP ------------
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.7 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(here)
## here() starts at /home/jro049/REPOS/X-factor-ART
# READ DATA ----------------
tfs_binding_signif_cpgs <- read_delim(
here("DATA", "TFs_binding_to_CpGs_JASPAR_ensembl.dat"),
delim = "\t"
)
## Rows: 14 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): CpG, TF_name
## lgl (1): in_MeDReader
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
tfs_binding_signif_cpgs
## # A tibble: 14 × 3
## CpG TF_name in_MeDReader
## <chr> <chr> <lgl>
## 1 cg25034591 FERD3L TRUE
## 2 cg25034591 STAT3 FALSE
## 3 cg25034591 TFAP2C TRUE
## 4 cg25034591 MEIS1 FALSE
## 5 cg25034591 SMAD2 FALSE
## 6 cg13866977 REL FALSE
## 7 cg13866977 OTX2 TRUE
## 8 cg13866977 TFAP4 TRUE
## 9 cg13866977 NEUROG2 TRUE
## 10 cg13866977 HAND2 TRUE
## 11 cg13866977 SMAD2 FALSE
## 12 cg13866977 MEIS1 FALSE
## 13 cg26175661 JUND TRUE
## 14 cg26175661 NR2C2 TRUE
binding_tf_methyl <- read_delim(
here("DATA", "extracted_lines_clean_all.dat"),
delim = "\t"
)
## Rows: 671 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): TF_name, part_tested, class, note
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
binding_tf_methyl
## # A tibble: 671 × 4
## TF_name part_tested class note
## <chr> <chr> <chr> <chr>
## 1 TFAP2A FL MethylMinus <NA>
## 2 TFAP2B DBD MethylMinus <NA>
## 3 TFAP2C DBD MethylMinus <NA>
## 4 TFAP2E FL MethylMinus <NA>
## 5 GRHL1 DBD MethylPlus C
## 6 TFCP2 DBD MethylPlus C
## 7 TFCP2 FL MethylPlus W
## 8 TFCP2L1 DBD MethylPlus C
## 9 UBP1 DBD MethylPlus C
## 10 YBX1 DBD No CpG <NA>
## # … with 661 more rows
# ANALYSE ---------------
tfs_binding_signif_cpgs <- tfs_binding_signif_cpgs %>%
left_join(binding_tf_methyl)
## Joining, by = "TF_name"
tfs_binding_signif_cpgs
## # A tibble: 16 × 6
## CpG TF_name in_MeDReader part_tested class note
## <chr> <chr> <lgl> <chr> <chr> <chr>
## 1 cg25034591 FERD3L TRUE FL No CpG <NA>
## 2 cg25034591 STAT3 FALSE <NA> <NA> <NA>
## 3 cg25034591 TFAP2C TRUE DBD MethylMinus <NA>
## 4 cg25034591 MEIS1 FALSE <NA> <NA> <NA>
## 5 cg25034591 SMAD2 FALSE <NA> <NA> <NA>
## 6 cg13866977 REL FALSE <NA> <NA> <NA>
## 7 cg13866977 OTX2 TRUE FL No CpG <NA>
## 8 cg13866977 TFAP4 TRUE DBD No CpG <NA>
## 9 cg13866977 TFAP4 TRUE FL No CpG <NA>
## 10 cg13866977 NEUROG2 TRUE DBD No CpG <NA>
## 11 cg13866977 NEUROG2 TRUE FL No CpG W
## 12 cg13866977 HAND2 TRUE DBD No CpG A
## 13 cg13866977 SMAD2 FALSE <NA> <NA> <NA>
## 14 cg13866977 MEIS1 FALSE <NA> <NA> <NA>
## 15 cg26175661 JUND TRUE DBD MethylMinus <NA>
## 16 cg26175661 NR2C2 TRUE DBD No CpG W
# check how many were not matched
tfs_binding_signif_cpgs %>%
filter(is.na(part_tested) & is.na(class))
## # A tibble: 6 × 6
## CpG TF_name in_MeDReader part_tested class note
## <chr> <chr> <lgl> <chr> <chr> <chr>
## 1 cg25034591 STAT3 FALSE <NA> <NA> <NA>
## 2 cg25034591 MEIS1 FALSE <NA> <NA> <NA>
## 3 cg25034591 SMAD2 FALSE <NA> <NA> <NA>
## 4 cg13866977 REL FALSE <NA> <NA> <NA>
## 5 cg13866977 SMAD2 FALSE <NA> <NA> <NA>
## 6 cg13866977 MEIS1 FALSE <NA> <NA> <NA>
# some of those are not matched because the name of TF is just an alias for
# another name, which was matched
tfs_binding_signif_cpgs %>%
filter(is.na(part_tested) & is.na(class) & in_MeDReader)
## # A tibble: 0 × 6
## # … with 6 variables: CpG <chr>, TF_name <chr>, in_MeDReader <lgl>,
## # part_tested <chr>, class <chr>, note <chr>
# how many of those matched have a note?
tfs_binding_signif_cpgs %>%
filter(!is.na(part_tested) & !is.na(class)) %>%
count(note)
## # A tibble: 3 × 2
## note n
## <chr> <int>
## 1 A 1
## 2 W 2
## 3 <NA> 7
tfs_binding_signif_cpgs %>%
filter(!is.na(part_tested) & !is.na(class)) %>%
count(note) %>%
filter(!is.na(note)) %>%
summarise(sum(n))
## # A tibble: 1 × 1
## `sum(n)`
## <int>
## 1 3
# to proceed with a clean dataset, I'll remove those that were not matched
# as well as those that have unreliable measures (i.e., contain something
# in the 'note' column)
tfs_binding_signif_cpgs_clean <- tfs_binding_signif_cpgs %>%
filter((!is.na(part_tested) & !is.na(class)) &
is.na(note))
DT::datatable(tfs_binding_signif_cpgs_clean)
# ANALYSE - how many duplicated entries? ------------
duplicates <- tfs_binding_signif_cpgs_clean %>%
count(TF_name) %>%
filter(n > 1)
duplicates
## # A tibble: 1 × 2
## TF_name n
## <chr> <int>
## 1 TFAP4 2
tfs_binding_signif_cpgs_clean %>%
filter(TF_name %in% duplicates$TF_name) %>%
arrange(TF_name)
## # A tibble: 2 × 6
## CpG TF_name in_MeDReader part_tested class note
## <chr> <chr> <lgl> <chr> <chr> <chr>
## 1 cg13866977 TFAP4 TRUE DBD No CpG <NA>
## 2 cg13866977 TFAP4 TRUE FL No CpG <NA>
# how many of those belong to various classes?
tfs_binding_signif_cpgs_clean %>%
filter(TF_name %in% duplicates$TF_name) %>%
arrange(TF_name) %>%
group_by(TF_name) %>%
count(class) %>%
filter(n == 1)
## # A tibble: 0 × 3
## # Groups: TF_name [0]
## # … with 3 variables: TF_name <chr>, class <chr>, n <int>
# THAT'S GOOD - all the duplicates were basically replicates :)
# let's get rid of the duplicates
# - first: remove the plain duplicates
nrow(tfs_binding_signif_cpgs_clean)
## [1] 7
tfs_binding_signif_cpgs_clean <- tfs_binding_signif_cpgs_clean %>%
distinct()
nrow(tfs_binding_signif_cpgs_clean)
## [1] 7
duplicates <- tfs_binding_signif_cpgs_clean %>%
count(TF_name) %>%
filter(n > 1)
duplicates
## # A tibble: 1 × 2
## TF_name n
## <chr> <int>
## 1 TFAP4 2
# - second: extract those that are not duplicated
tfs_binding_signif_cpgs_clean_part1 <- tfs_binding_signif_cpgs_clean %>%
filter(!(TF_name %in% duplicates$TF_name))
# - third: extract those that were duplicated and get only one copy
# out of there: if there is 'FL' part tested, take it, if not,
# take any
tfs_binding_signif_cpgs_clean_part2 <- tfs_binding_signif_cpgs_clean %>%
filter(TF_name %in% duplicates$TF_name)
tfs_binding_signif_cpgs_clean_part2_fl <- tfs_binding_signif_cpgs_clean_part2 %>%
filter(part_tested == "FL")
tfs_binding_signif_cpgs_clean_part2_fl
## # A tibble: 1 × 6
## CpG TF_name in_MeDReader part_tested class note
## <chr> <chr> <lgl> <chr> <chr> <chr>
## 1 cg13866977 TFAP4 TRUE FL No CpG <NA>
tfs_binding_signif_cpgs_clean_part2 %>%
filter(!(TF_name %in% tfs_binding_signif_cpgs_clean_part2_fl$TF_name)) %>%
arrange(TF_name)
## # A tibble: 0 × 6
## # … with 6 variables: CpG <chr>, TF_name <chr>, in_MeDReader <lgl>,
## # part_tested <chr>, class <chr>, note <chr>
# these are binding in various places, so that's OK
# - finally: merge the two parts
tfs_binding_signif_cpgs_clean <- bind_rows(
tfs_binding_signif_cpgs_clean_part1,
tfs_binding_signif_cpgs_clean_part2_fl,
tfs_binding_signif_cpgs_clean_part2 %>%
filter(!(TF_name %in% tfs_binding_signif_cpgs_clean_part2_fl$TF_name)) %>%
arrange(TF_name),
tfs_binding_signif_cpgs %>%
filter(is.na(part_tested) & is.na(class) & !in_MeDReader)
)
DT::datatable(tfs_binding_signif_cpgs_clean)
# ANALYSE - how many of those TFs can bind to a methylated seq.? ------
# How many various TFs per CpG?
tfs_binding_signif_cpgs_clean %>%
count(CpG)
## # A tibble: 3 × 2
## CpG n
## <chr> <int>
## 1 cg13866977 6
## 2 cg25034591 5
## 3 cg26175661 1
# I will do this separately, per CpG
knitr::kable(
tfs_binding_signif_cpgs_clean %>%
group_by(CpG) %>%
count(class),
caption = "Types of TF binding to a motif containing one of the significant CpGs"
)
Types of TF binding to a motif containing one of the
significant CpGs
cg13866977 |
No CpG |
3 |
cg13866977 |
NA |
3 |
cg25034591 |
MethylMinus |
1 |
cg25034591 |
No CpG |
1 |
cg25034591 |
NA |
3 |
cg26175661 |
MethylMinus |
1 |
# It's the TFs that are in class 'MethylPlus' or 'MethylMinus' that are of
# interest - those react on methylation status of the binding sequence
knitr::kable(
tfs_binding_signif_cpgs_clean %>%
filter(class %in% c("MethylMinus", "MethylPlus")) %>%
arrange(CpG, TF_name)
)
cg25034591 |
TFAP2C |
TRUE |
DBD |
MethylMinus |
NA |
cg26175661 |
JUND |
TRUE |
DBD |
MethylMinus |
NA |
# SAVE ------------
write_delim(
tfs_binding_signif_cpgs_clean %>%
filter(class %in% c("MethylMinus", "MethylPlus")) %>%
arrange(CpG, TF_name),
file = here("DATA", "TFs_methyl_sensitive_signif_CpGs.dat"),
delim = "\t"
)
write_delim(
tfs_binding_signif_cpgs_clean %>%
arrange(CpG, TF_name),
file = here("DATA", "TFs_binding_signif_CpGs_cleaned_all.dat")
)