# DESCRIPTION ---------------
#   Get the data extracted from Yin, Y., et al. (2017). 'Impact of cytosine
#   methylation on DNA binding specificities of human transcription factors.'
#   Science, 356(6337), eaaj2239. https://doi.org/10.1126/science.aaj2239 .
#   The supplementary PDF file (Databases_S1_for_submission.pdf) includes
#   all the measured TFs with their classification based on whether their
#   binding sequence includes a CpG or not and based on whether they bind
#   better to the methylated CpG or not.
#   PDF is not a good way of keeping that information; I've exported the text
#   only into DATA/yin_science_2017_TF_types.txt and then ran the script
#   'extract_data_from_yin.sh' to create a file with one TF per line
#   (DATA/extracted_lines_clean.txt).
#   I can analyse this data from there.
#
#  From the supplementary info - some details on special marking:
#    *A*      Signal from Another TF family: Raw sequence data contains reads
#             belonging to another TF family. These are not included to the
#             model presented as the seed excludes them. Information provided
#             to aid bioinformatic analyses of full data.
#    *B*      CG frequency Below cut-off: CG containing subsequences were
#             detected, but frequency of the CG was below the 10% cut-off for
#             calling the effect of methylation on binding.
#    *C*      Call based on bisulfite SELEX: kmer analysis does not detect
#             effect, call is based on bisulfite SELEX. This is caused mostly
#             by absolute requirement for CG in site, in which case the small
#             fraction of unmethylated CG containing ligands that is left after
#             MSssI treatment is enriched.
#    *M*      Motif-dependent effect: TF has multiple motifs, with different
#             effect of CG methylation.
#    *N*      Weak enrichment with Non-specific signal: Enrichment for the motif
#             was weak, there is also signal for enrichment of subsequences
#             that enrich weakly in many weak or failed experiments.
#    *S*      Suboptimal k-mer length: kmer is too short or long compared to
#             the corresponding motif. 8mer analysis does not detect the CG or
#             doesn't detect the CG at the correct position.
#    *U*      Unique reads used to detect signal: Selection library has low
#             diversity after selection, unique reads used to detect signal.
#    *W*      Weak: Obtained motif has relatively low number of reads or weak
#             enrichment.
#
#   Additionally, the abbreviations after the name of the TF mean:
#    *FL*    full length of the protein was tested
#    *DBD*   DNA-binding domain
#    *eDBD*  extended DNA-binding domain
#
# AUTHOR: Julia Romanowska
# DATE CREATED: 2022-01-06
# DATE MODIFIED:

# SETUP -----------------
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(here)
## here() starts at /scratch/Naukowe/START_project/X-chrom
# READ DATA -----------------------
extracted_lines <- read_delim(
  here("DATA", "extracted_lines_clean.txt"),
  delim = ":",
  col_names = FALSE
)
## Rows: 671 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ":"
## chr (2): X1, X2
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
extracted_lines
## # A tibble: 671 × 2
##    X1               X2               
##    <chr>            <chr>            
##  1 "TFAP2A _ FL "   " MethylMinus"   
##  2 "TFAP2B _ DBD "  " MethylMinus"   
##  3 "TFAP2C _ DBD "  " MethylMinus"   
##  4 "TFAP2E _ FL "   " MethylMinus"   
##  5 "GRHL1 _ DBD "   " MethylPlus *C*"
##  6 "TFCP2 _ DBD "   " MethylPlus *C*"
##  7 "TFCP2 _ FL "    " MethylPlus *W*"
##  8 "TFCP2L1 _ DBD " " MethylPlus *C*"
##  9 "UBP1 _ DBD "    " MethylPlus *C*"
## 10 "YBX1 _ DBD "    " No CpG"        
## # … with 661 more rows
colnames(extracted_lines) <- c("TF_full_name", "class_full")
extracted_lines
## # A tibble: 671 × 2
##    TF_full_name     class_full       
##    <chr>            <chr>            
##  1 "TFAP2A _ FL "   " MethylMinus"   
##  2 "TFAP2B _ DBD "  " MethylMinus"   
##  3 "TFAP2C _ DBD "  " MethylMinus"   
##  4 "TFAP2E _ FL "   " MethylMinus"   
##  5 "GRHL1 _ DBD "   " MethylPlus *C*"
##  6 "TFCP2 _ DBD "   " MethylPlus *C*"
##  7 "TFCP2 _ FL "    " MethylPlus *W*"
##  8 "TFCP2L1 _ DBD " " MethylPlus *C*"
##  9 "UBP1 _ DBD "    " MethylPlus *C*"
## 10 "YBX1 _ DBD "    " No CpG"        
## # … with 661 more rows
# TIDY ------------------
# remove all the whitespace characters and divide into extra columns
extracted_lines_clean <- extracted_lines %>%
  separate(
    col = TF_full_name,
    into = c("TF_name", "part_tested"),
    sep = "_"
  ) %>%
  separate(
    col = class_full,
    into = c("class", "note"),
    sep = "\\*",
    extra = "merge",
    fill = "right"
  ) %>%
  mutate(across(everything(), str_trim)) %>%
  mutate(across(c(class, part_tested), as.factor))
extracted_lines_clean
## # A tibble: 671 × 4
##    TF_name part_tested class       note 
##    <chr>   <fct>       <fct>       <chr>
##  1 TFAP2A  FL          MethylMinus <NA> 
##  2 TFAP2B  DBD         MethylMinus <NA> 
##  3 TFAP2C  DBD         MethylMinus <NA> 
##  4 TFAP2E  FL          MethylMinus <NA> 
##  5 GRHL1   DBD         MethylPlus  C*   
##  6 TFCP2   DBD         MethylPlus  C*   
##  7 TFCP2   FL          MethylPlus  W*   
##  8 TFCP2L1 DBD         MethylPlus  C*   
##  9 UBP1    DBD         MethylPlus  C*   
## 10 YBX1    DBD         No CpG      <NA> 
## # … with 661 more rows
skimr::skim(extracted_lines_clean)
Data summary
Name extracted_lines_clean
Number of rows 671
Number of columns 4
_______________________
Column type frequency:
character 2
factor 2
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
TF_name 0 1.00 1 7 0 542 0
note 412 0.39 2 3 0 15 0

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
part_tested 0 1 FALSE 2 DBD: 444, FL: 227
class 0 1 FALSE 9 Met: 221, No : 205, Met: 147, Lit: 39
# check the various classes
#  - TF name
extracted_lines_clean %>%
  count(TF_name) %>%
  count(n)
## Storing counts in `nn`, as `n` already present in input
## ℹ Use `name = "new_name"` to pick a new name.
## # A tibble: 2 × 2
##       n    nn
##   <int> <int>
## 1     1   413
## 2     2   129
# why are there more than one entry for some TFs?
surplus_TF_entries <- extracted_lines_clean %>%
  count(TF_name) %>%
  filter(n > 1) %>%
  pull(TF_name)

extracted_lines_clean %>%
  filter(TF_name %in% surplus_TF_entries)
## # A tibble: 258 × 4
##    TF_name part_tested class       note 
##    <chr>   <fct>       <fct>       <chr>
##  1 TFCP2   DBD         MethylPlus  C*   
##  2 TFCP2   FL          MethylPlus  W*   
##  3 ONECUT1 DBD         MethylMinus <NA> 
##  4 ONECUT1 FL          MethylMinus <NA> 
##  5 ONECUT2 DBD         MethylMinus <NA> 
##  6 ONECUT2 FL          MethylMinus <NA> 
##  7 ASCL2   DBD         No CpG      <NA> 
##  8 ASCL2   FL          MethylMinus S*   
##  9 BHLHE23 DBD         No CpG      <NA> 
## 10 BHLHE23 FL          No CpG      W*   
## # … with 248 more rows
# looks OK

#  - check part_tested
extracted_lines_clean %>%
  count(part_tested)
## # A tibble: 2 × 2
##   part_tested     n
##   <fct>       <int>
## 1 DBD           444
## 2 FL            227
#  - check class
extracted_lines_clean %>%
  count(class)
## # A tibble: 9 × 2
##   class                n
##   <fct>            <int>
## 1 inconclusive        17
## 2 Inconclusive        12
## 3 Little effect       39
## 4 MethylMinus        147
## 5 MethylMInus          1
## 6 MethylPlus         221
## 7 Motif dependent     15
## 8 Multiple effects    14
## 9 No CpG             205
# need to fix wrong typing
extracted_lines_clean <- extracted_lines_clean %>%
  mutate(class = as.character(class)) %>%
  mutate(class = case_when(
    class == "inconclusive" ~ "Inconclusive",
    class == "MethylMInus" ~ "MethylMinus",
    TRUE ~ class
  )) %>%
  mutate(class = as.factor(class))
extracted_lines_clean %>%
  count(class)
## # A tibble: 7 × 2
##   class                n
##   <fct>            <int>
## 1 Inconclusive        29
## 2 Little effect       39
## 3 MethylMinus        148
## 4 MethylPlus         221
## 5 Motif dependent     15
## 6 Multiple effects    14
## 7 No CpG             205
#  - check note
extracted_lines_clean %>%
  count(note)
## # A tibble: 16 × 2
##    note      n
##    <chr> <int>
##  1 A*        8
##  2 AC*       1
##  3 B*       23
##  4 C*       65
##  5 M*       15
##  6 N*        9
##  7 NS*       1
##  8 S*       91
##  9 SA*       1
## 10 SN*       2
## 11 U*        6
## 12 UW*       2
## 13 W*       27
## 14 WB*       2
## 15 WC*       6
## 16 <NA>    412
# need to remove the asterisk
extracted_lines_clean <- extracted_lines_clean %>%
  mutate(note = str_replace(note, fixed("*"), ""))

extracted_lines_clean %>%
  count(note)
## # A tibble: 16 × 2
##    note      n
##    <chr> <int>
##  1 A         8
##  2 AC        1
##  3 B        23
##  4 C        65
##  5 M        15
##  6 N         9
##  7 NS        1
##  8 S        91
##  9 SA        1
## 10 SN        2
## 11 U         6
## 12 UW        2
## 13 W        27
## 14 WB        2
## 15 WC        6
## 16 <NA>    412
# there are two types of 'note' that are the same: NS and SN
extracted_lines_clean <- extracted_lines_clean %>%
  mutate(note = ifelse(
    note == "SN",
    yes = "NS",
    no = note
  )) %>%
  mutate(note = as.factor(note))
extracted_lines_clean %>%
  count(note)
## # A tibble: 15 × 2
##    note      n
##    <fct> <int>
##  1 A         8
##  2 AC        1
##  3 B        23
##  4 C        65
##  5 M        15
##  6 N         9
##  7 NS        3
##  8 S        91
##  9 SA        1
## 10 U         6
## 11 UW        2
## 12 W        27
## 13 WB        2
## 14 WC        6
## 15 <NA>    412
# check everything once more
skimr::skim(extracted_lines_clean)
Data summary
Name extracted_lines_clean
Number of rows 671
Number of columns 4
_______________________
Column type frequency:
character 1
factor 3
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
TF_name 0 1 1 7 0 542 0

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
part_tested 0 1.00 FALSE 2 DBD: 444, FL: 227
class 0 1.00 FALSE 7 Met: 221, No : 205, Met: 148, Lit: 39
note 412 0.39 FALSE 14 S: 91, C: 65, W: 27, B: 23
# SAVE ---------------------
write_delim(
  extracted_lines_clean,
  file = here("DATA", "extracted_lines_clean_all.dat"),
  delim = "\t"
)