# DESCRIPTION: ------------
#  I will check which TFs are actually binding _preferrably_ to the methylated
#  sequence, based on data from Yin, Y., et al. (2017). Science, 356(6337),
#  eaaj2239. https://doi.org/10.1126/science.aaj2239 (prepared in 
#  'create_TF_methyl_binding_dataset.R') and data collected from JASPAR 2022
#  TFBS hg19 via visualization in ensembl
#
# AUTHOR: Julia Romanowska
# DATE CREATED: 2022-01-06
# DATE MODIFIED:

# SETUP ------------
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.7     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(here)
## here() starts at /home/jro049/REPOS/X-factor-ART
# READ DATA ----------------
tfs_binding_signif_cpgs <- read_delim(
  here("DATA", "TFs_binding_to_CpGs_JASPAR_ensembl.dat"),
  delim = "\t"
)
## Rows: 14 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): CpG, TF_name
## lgl (1): in_MeDReader
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
tfs_binding_signif_cpgs
## # A tibble: 14 × 3
##    CpG        TF_name in_MeDReader
##    <chr>      <chr>   <lgl>       
##  1 cg25034591 FERD3L  TRUE        
##  2 cg25034591 STAT3   FALSE       
##  3 cg25034591 TFAP2C  TRUE        
##  4 cg25034591 MEIS1   FALSE       
##  5 cg25034591 SMAD2   FALSE       
##  6 cg13866977 REL     FALSE       
##  7 cg13866977 OTX2    TRUE        
##  8 cg13866977 TFAP4   TRUE        
##  9 cg13866977 NEUROG2 TRUE        
## 10 cg13866977 HAND2   TRUE        
## 11 cg13866977 SMAD2   FALSE       
## 12 cg13866977 MEIS1   FALSE       
## 13 cg26175661 JUND    TRUE        
## 14 cg26175661 NR2C2   TRUE
binding_tf_methyl <- read_delim(
  here("DATA", "extracted_lines_clean_all.dat"),
  delim = "\t"
)
## Rows: 671 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): TF_name, part_tested, class, note
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
binding_tf_methyl
## # A tibble: 671 × 4
##    TF_name part_tested class       note 
##    <chr>   <chr>       <chr>       <chr>
##  1 TFAP2A  FL          MethylMinus <NA> 
##  2 TFAP2B  DBD         MethylMinus <NA> 
##  3 TFAP2C  DBD         MethylMinus <NA> 
##  4 TFAP2E  FL          MethylMinus <NA> 
##  5 GRHL1   DBD         MethylPlus  C    
##  6 TFCP2   DBD         MethylPlus  C    
##  7 TFCP2   FL          MethylPlus  W    
##  8 TFCP2L1 DBD         MethylPlus  C    
##  9 UBP1    DBD         MethylPlus  C    
## 10 YBX1    DBD         No CpG      <NA> 
## # … with 661 more rows
# ANALYSE ---------------
tfs_binding_signif_cpgs <- tfs_binding_signif_cpgs %>%
  left_join(binding_tf_methyl)
## Joining, by = "TF_name"
tfs_binding_signif_cpgs
## # A tibble: 16 × 6
##    CpG        TF_name in_MeDReader part_tested class       note 
##    <chr>      <chr>   <lgl>        <chr>       <chr>       <chr>
##  1 cg25034591 FERD3L  TRUE         FL          No CpG      <NA> 
##  2 cg25034591 STAT3   FALSE        <NA>        <NA>        <NA> 
##  3 cg25034591 TFAP2C  TRUE         DBD         MethylMinus <NA> 
##  4 cg25034591 MEIS1   FALSE        <NA>        <NA>        <NA> 
##  5 cg25034591 SMAD2   FALSE        <NA>        <NA>        <NA> 
##  6 cg13866977 REL     FALSE        <NA>        <NA>        <NA> 
##  7 cg13866977 OTX2    TRUE         FL          No CpG      <NA> 
##  8 cg13866977 TFAP4   TRUE         DBD         No CpG      <NA> 
##  9 cg13866977 TFAP4   TRUE         FL          No CpG      <NA> 
## 10 cg13866977 NEUROG2 TRUE         DBD         No CpG      <NA> 
## 11 cg13866977 NEUROG2 TRUE         FL          No CpG      W    
## 12 cg13866977 HAND2   TRUE         DBD         No CpG      A    
## 13 cg13866977 SMAD2   FALSE        <NA>        <NA>        <NA> 
## 14 cg13866977 MEIS1   FALSE        <NA>        <NA>        <NA> 
## 15 cg26175661 JUND    TRUE         DBD         MethylMinus <NA> 
## 16 cg26175661 NR2C2   TRUE         DBD         No CpG      W
# check how many were not matched
tfs_binding_signif_cpgs %>%
  filter(is.na(part_tested) & is.na(class))
## # A tibble: 6 × 6
##   CpG        TF_name in_MeDReader part_tested class note 
##   <chr>      <chr>   <lgl>        <chr>       <chr> <chr>
## 1 cg25034591 STAT3   FALSE        <NA>        <NA>  <NA> 
## 2 cg25034591 MEIS1   FALSE        <NA>        <NA>  <NA> 
## 3 cg25034591 SMAD2   FALSE        <NA>        <NA>  <NA> 
## 4 cg13866977 REL     FALSE        <NA>        <NA>  <NA> 
## 5 cg13866977 SMAD2   FALSE        <NA>        <NA>  <NA> 
## 6 cg13866977 MEIS1   FALSE        <NA>        <NA>  <NA>
# some of those are not matched because the name of TF is just an alias for
#   another name, which was matched
tfs_binding_signif_cpgs %>%
  filter(is.na(part_tested) & is.na(class) & in_MeDReader)
## # A tibble: 0 × 6
## # … with 6 variables: CpG <chr>, TF_name <chr>, in_MeDReader <lgl>,
## #   part_tested <chr>, class <chr>, note <chr>
# how many of those matched have a note?
tfs_binding_signif_cpgs %>%
  filter(!is.na(part_tested) & !is.na(class)) %>%
  count(note)
## # A tibble: 3 × 2
##   note      n
##   <chr> <int>
## 1 A         1
## 2 W         2
## 3 <NA>      7
tfs_binding_signif_cpgs %>%
  filter(!is.na(part_tested) & !is.na(class)) %>%
  count(note) %>%
  filter(!is.na(note)) %>%
  summarise(sum(n))
## # A tibble: 1 × 1
##   `sum(n)`
##      <int>
## 1        3
# to proceed with a clean dataset, I'll remove those that were not matched
#  as well as those that have unreliable measures (i.e., contain something
#  in the 'note' column)
tfs_binding_signif_cpgs_clean <- tfs_binding_signif_cpgs %>%
  filter((!is.na(part_tested) & !is.na(class)) &
          is.na(note))
DT::datatable(tfs_binding_signif_cpgs_clean)
# ANALYSE - how many duplicated entries? ------------
duplicates <- tfs_binding_signif_cpgs_clean %>%
  count(TF_name) %>%
  filter(n > 1)
duplicates
## # A tibble: 1 × 2
##   TF_name     n
##   <chr>   <int>
## 1 TFAP4       2
tfs_binding_signif_cpgs_clean %>%
  filter(TF_name %in% duplicates$TF_name) %>%
  arrange(TF_name)
## # A tibble: 2 × 6
##   CpG        TF_name in_MeDReader part_tested class  note 
##   <chr>      <chr>   <lgl>        <chr>       <chr>  <chr>
## 1 cg13866977 TFAP4   TRUE         DBD         No CpG <NA> 
## 2 cg13866977 TFAP4   TRUE         FL          No CpG <NA>
# how many of those belong to various classes?
tfs_binding_signif_cpgs_clean %>%
  filter(TF_name %in% duplicates$TF_name) %>%
  arrange(TF_name) %>%
  group_by(TF_name) %>%
  count(class) %>%
  filter(n == 1)
## # A tibble: 0 × 3
## # Groups:   TF_name [0]
## # … with 3 variables: TF_name <chr>, class <chr>, n <int>
# THAT'S GOOD - all the duplicates were basically replicates :)
# let's get rid of the duplicates

#  - first: remove the plain duplicates
nrow(tfs_binding_signif_cpgs_clean)
## [1] 7
tfs_binding_signif_cpgs_clean <- tfs_binding_signif_cpgs_clean %>%
  distinct()
nrow(tfs_binding_signif_cpgs_clean)
## [1] 7
duplicates <- tfs_binding_signif_cpgs_clean %>%
  count(TF_name) %>%
  filter(n > 1)
duplicates
## # A tibble: 1 × 2
##   TF_name     n
##   <chr>   <int>
## 1 TFAP4       2
#  - second: extract those that are not duplicated
tfs_binding_signif_cpgs_clean_part1 <- tfs_binding_signif_cpgs_clean %>%
  filter(!(TF_name %in% duplicates$TF_name))
#  - third: extract those that were duplicated and get only one copy
#           out of there: if there is 'FL' part tested, take it, if not,
#           take any
tfs_binding_signif_cpgs_clean_part2 <- tfs_binding_signif_cpgs_clean %>%
  filter(TF_name %in% duplicates$TF_name)
tfs_binding_signif_cpgs_clean_part2_fl <- tfs_binding_signif_cpgs_clean_part2 %>%
  filter(part_tested == "FL")
tfs_binding_signif_cpgs_clean_part2_fl
## # A tibble: 1 × 6
##   CpG        TF_name in_MeDReader part_tested class  note 
##   <chr>      <chr>   <lgl>        <chr>       <chr>  <chr>
## 1 cg13866977 TFAP4   TRUE         FL          No CpG <NA>
tfs_binding_signif_cpgs_clean_part2 %>%
  filter(!(TF_name %in% tfs_binding_signif_cpgs_clean_part2_fl$TF_name)) %>%
  arrange(TF_name)
## # A tibble: 0 × 6
## # … with 6 variables: CpG <chr>, TF_name <chr>, in_MeDReader <lgl>,
## #   part_tested <chr>, class <chr>, note <chr>
# these are binding in various places, so that's OK

#  - finally: merge the two parts
tfs_binding_signif_cpgs_clean <- bind_rows(
  tfs_binding_signif_cpgs_clean_part1,
  tfs_binding_signif_cpgs_clean_part2_fl,
  tfs_binding_signif_cpgs_clean_part2 %>%
    filter(!(TF_name %in% tfs_binding_signif_cpgs_clean_part2_fl$TF_name)) %>%
    arrange(TF_name),
  tfs_binding_signif_cpgs %>%
    filter(is.na(part_tested) & is.na(class) & !in_MeDReader)
)
DT::datatable(tfs_binding_signif_cpgs_clean)
# ANALYSE - how many of those TFs can bind to a methylated seq.? ------
# How many various TFs per CpG?
tfs_binding_signif_cpgs_clean %>%
  count(CpG)
## # A tibble: 3 × 2
##   CpG            n
##   <chr>      <int>
## 1 cg13866977     6
## 2 cg25034591     5
## 3 cg26175661     1
# I will do this separately, per CpG
knitr::kable(
  tfs_binding_signif_cpgs_clean %>%
    group_by(CpG) %>%
    count(class),
  caption = "Types of TF binding to a motif containing one of the significant CpGs"
)
Types of TF binding to a motif containing one of the significant CpGs
CpG class n
cg13866977 No CpG 3
cg13866977 NA 3
cg25034591 MethylMinus 1
cg25034591 No CpG 1
cg25034591 NA 3
cg26175661 MethylMinus 1
# It's the TFs that are in class 'MethylPlus' or 'MethylMinus' that are of
#  interest - those react on methylation status of the binding sequence
knitr::kable(
  tfs_binding_signif_cpgs_clean %>%
    filter(class %in% c("MethylMinus", "MethylPlus")) %>%
    arrange(CpG, TF_name)
)
CpG TF_name in_MeDReader part_tested class note
cg25034591 TFAP2C TRUE DBD MethylMinus NA
cg26175661 JUND TRUE DBD MethylMinus NA
# SAVE ------------
write_delim(
  tfs_binding_signif_cpgs_clean %>%
    filter(class %in% c("MethylMinus", "MethylPlus")) %>%
    arrange(CpG, TF_name),
  file = here("DATA", "TFs_methyl_sensitive_signif_CpGs.dat"),
  delim = "\t"
)

write_delim(
  tfs_binding_signif_cpgs_clean %>%
    arrange(CpG, TF_name),
  file = here("DATA", "TFs_binding_signif_CpGs_cleaned_all.dat")
)