create_TF_methyl_binding

# DESCRIPTION ---------------
#   Get the data extracted from Yin, Y., et al. (2017). 'Impact of cytosine
#   methylation on DNA binding specificities of human transcription factors.'
#   Science, 356(6337), eaaj2239. https://doi.org/10.1126/science.aaj2239 .
#   The supplementary PDF file (Databases_S1_for_submission.pdf) includes
#   all the measured TFs with their classification based on whether their
#   binding sequence includes a CpG or not and based on whether they bind
#   better to the methylated CpG or not.
#   PDF is not a good way of keeping that information; I've exported the text
#   only into DATA/yin_science_2017_TF_types.txt and then ran the script
#   'extract_data_from_yin.sh' to create a file with one TF per line
#   (DATA/extracted_lines_clean.txt).
#   I can analyse this data from there.
#
#  From the supplementary info - some details on special marking:
#    *A*      Signal from Another TF family: Raw sequence data contains reads
#             belonging to another TF family. These are not included to the
#             model presented as the seed excludes them. Information provided
#             to aid bioinformatic analyses of full data.
#    *B*      CG frequency Below cut-off: CG containing subsequences were
#             detected, but frequency of the CG was below the 10% cut-off for
#             calling the effect of methylation on binding.
#    *C*      Call based on bisulfite SELEX: kmer analysis does not detect
#             effect, call is based on bisulfite SELEX. This is caused mostly
#             by absolute requirement for CG in site, in which case the small
#             fraction of unmethylated CG containing ligands that is left after
#             MSssI treatment is enriched.
#    *M*      Motif-dependent effect: TF has multiple motifs, with different
#             effect of CG methylation.
#    *N*      Weak enrichment with Non-specific signal: Enrichment for the motif
#             was weak, there is also signal for enrichment of subsequences
#             that enrich weakly in many weak or failed experiments.
#    *S*      Suboptimal k-mer length: kmer is too short or long compared to
#             the corresponding motif. 8mer analysis does not detect the CG or
#             doesn't detect the CG at the correct position.
#    *U*      Unique reads used to detect signal: Selection library has low
#             diversity after selection, unique reads used to detect signal.
#    *W*      Weak: Obtained motif has relatively low number of reads or weak
#             enrichment.
#
#   Additionally, the abbreviations after the name of the TF mean:
#    *FL*    full length of the protein was tested
#    *DBD*   DNA-binding domain
#    *eDBD*  extended DNA-binding domain
#
# AUTHOR: Julia Romanowska
# DATE CREATED: 2022-01-06
# DATE MODIFIED:

# SETUP -----------------
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.0     ✓ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(here)

## here() starts at /scratch/Naukowe/START_project/X-chrom

# READ DATA -----------------------
extracted_lines <- read_delim(
  here("DATA", "extracted_lines_clean.txt"),
  delim = ":",
  col_names = FALSE
)

## Rows: 671 Columns: 2

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ":"
## chr (2): X1, X2

## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

extracted_lines

## # A tibble: 671 × 2
##    X1               X2               
##    <chr>            <chr>            
##  1 "TFAP2A _ FL "   " MethylMinus"   
##  2 "TFAP2B _ DBD "  " MethylMinus"   
##  3 "TFAP2C _ DBD "  " MethylMinus"   
##  4 "TFAP2E _ FL "   " MethylMinus"   
##  5 "GRHL1 _ DBD "   " MethylPlus *C*"
##  6 "TFCP2 _ DBD "   " MethylPlus *C*"
##  7 "TFCP2 _ FL "    " MethylPlus *W*"
##  8 "TFCP2L1 _ DBD " " MethylPlus *C*"
##  9 "UBP1 _ DBD "    " MethylPlus *C*"
## 10 "YBX1 _ DBD "    " No CpG"        
## # … with 661 more rows

colnames(extracted_lines) <- c("TF_full_name", "class_full")
extracted_lines

## # A tibble: 671 × 2
##    TF_full_name     class_full       
##    <chr>            <chr>            
##  1 "TFAP2A _ FL "   " MethylMinus"   
##  2 "TFAP2B _ DBD "  " MethylMinus"   
##  3 "TFAP2C _ DBD "  " MethylMinus"   
##  4 "TFAP2E _ FL "   " MethylMinus"   
##  5 "GRHL1 _ DBD "   " MethylPlus *C*"
##  6 "TFCP2 _ DBD "   " MethylPlus *C*"
##  7 "TFCP2 _ FL "    " MethylPlus *W*"
##  8 "TFCP2L1 _ DBD " " MethylPlus *C*"
##  9 "UBP1 _ DBD "    " MethylPlus *C*"
## 10 "YBX1 _ DBD "    " No CpG"        
## # … with 661 more rows

# TIDY ------------------
# remove all the whitespace characters and divide into extra columns
extracted_lines_clean <- extracted_lines %>%
  separate(
    col = TF_full_name,
    into = c("TF_name", "part_tested"),
    sep = "_"
  ) %>%
  separate(
    col = class_full,
    into = c("class", "note"),
    sep = "\\*",
    extra = "merge",
    fill = "right"
  ) %>%
  mutate(across(everything(), str_trim)) %>%
  mutate(across(c(class, part_tested), as.factor))
extracted_lines_clean

## # A tibble: 671 × 4
##    TF_name part_tested class       note 
##    <chr>   <fct>       <fct>       <chr>
##  1 TFAP2A  FL          MethylMinus <NA> 
##  2 TFAP2B  DBD         MethylMinus <NA> 
##  3 TFAP2C  DBD         MethylMinus <NA> 
##  4 TFAP2E  FL          MethylMinus <NA> 
##  5 GRHL1   DBD         MethylPlus  C*   
##  6 TFCP2   DBD         MethylPlus  C*   
##  7 TFCP2   FL          MethylPlus  W*   
##  8 TFCP2L1 DBD         MethylPlus  C*   
##  9 UBP1    DBD         MethylPlus  C*   
## 10 YBX1    DBD         No CpG      <NA> 
## # … with 661 more rows

skimr::skim(extracted_lines_clean)

Data summary
Name	extracted_lines_clean
Number of rows	671
Number of columns	4
_______________________
Column type frequency:
character	2
factor	2
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
TF_name	0	1.00	1	7	0	542	0
note	412	0.39	2	3	0	15	0

Variable type: factor

skim_variable	n_missing	complete_rate	ordered	n_unique	top_counts
part_tested	0	1	FALSE	2	DBD: 444, FL: 227
class	0	1	FALSE	9	Met: 221, No : 205, Met: 147, Lit: 39

# check the various classes
#  - TF name
extracted_lines_clean %>%
  count(TF_name) %>%
  count(n)

## Storing counts in `nn`, as `n` already present in input
## ℹ Use `name = "new_name"` to pick a new name.

## # A tibble: 2 × 2
##       n    nn
##   <int> <int>
## 1     1   413
## 2     2   129

# why are there more than one entry for some TFs?
surplus_TF_entries <- extracted_lines_clean %>%
  count(TF_name) %>%
  filter(n > 1) %>%
  pull(TF_name)

extracted_lines_clean %>%
  filter(TF_name %in% surplus_TF_entries)

## # A tibble: 258 × 4
##    TF_name part_tested class       note 
##    <chr>   <fct>       <fct>       <chr>
##  1 TFCP2   DBD         MethylPlus  C*   
##  2 TFCP2   FL          MethylPlus  W*   
##  3 ONECUT1 DBD         MethylMinus <NA> 
##  4 ONECUT1 FL          MethylMinus <NA> 
##  5 ONECUT2 DBD         MethylMinus <NA> 
##  6 ONECUT2 FL          MethylMinus <NA> 
##  7 ASCL2   DBD         No CpG      <NA> 
##  8 ASCL2   FL          MethylMinus S*   
##  9 BHLHE23 DBD         No CpG      <NA> 
## 10 BHLHE23 FL          No CpG      W*   
## # … with 248 more rows

# looks OK

#  - check part_tested
extracted_lines_clean %>%
  count(part_tested)

## # A tibble: 2 × 2
##   part_tested     n
##   <fct>       <int>
## 1 DBD           444
## 2 FL            227

#  - check class
extracted_lines_clean %>%
  count(class)

## # A tibble: 9 × 2
##   class                n
##   <fct>            <int>
## 1 inconclusive        17
## 2 Inconclusive        12
## 3 Little effect       39
## 4 MethylMinus        147
## 5 MethylMInus          1
## 6 MethylPlus         221
## 7 Motif dependent     15
## 8 Multiple effects    14
## 9 No CpG             205

# need to fix wrong typing
extracted_lines_clean <- extracted_lines_clean %>%
  mutate(class = as.character(class)) %>%
  mutate(class = case_when(
    class == "inconclusive" ~ "Inconclusive",
    class == "MethylMInus" ~ "MethylMinus",
    TRUE ~ class
  )) %>%
  mutate(class = as.factor(class))
extracted_lines_clean %>%
  count(class)

## # A tibble: 7 × 2
##   class                n
##   <fct>            <int>
## 1 Inconclusive        29
## 2 Little effect       39
## 3 MethylMinus        148
## 4 MethylPlus         221
## 5 Motif dependent     15
## 6 Multiple effects    14
## 7 No CpG             205

#  - check note
extracted_lines_clean %>%
  count(note)

## # A tibble: 16 × 2
##    note      n
##    <chr> <int>
##  1 A*        8
##  2 AC*       1
##  3 B*       23
##  4 C*       65
##  5 M*       15
##  6 N*        9
##  7 NS*       1
##  8 S*       91
##  9 SA*       1
## 10 SN*       2
## 11 U*        6
## 12 UW*       2
## 13 W*       27
## 14 WB*       2
## 15 WC*       6
## 16 <NA>    412

# need to remove the asterisk
extracted_lines_clean <- extracted_lines_clean %>%
  mutate(note = str_replace(note, fixed("*"), ""))

extracted_lines_clean %>%
  count(note)

## # A tibble: 16 × 2
##    note      n
##    <chr> <int>
##  1 A         8
##  2 AC        1
##  3 B        23
##  4 C        65
##  5 M        15
##  6 N         9
##  7 NS        1
##  8 S        91
##  9 SA        1
## 10 SN        2
## 11 U         6
## 12 UW        2
## 13 W        27
## 14 WB        2
## 15 WC        6
## 16 <NA>    412

# there are two types of 'note' that are the same: NS and SN
extracted_lines_clean <- extracted_lines_clean %>%
  mutate(note = ifelse(
    note == "SN",
    yes = "NS",
    no = note
  )) %>%
  mutate(note = as.factor(note))
extracted_lines_clean %>%
  count(note)

## # A tibble: 15 × 2
##    note      n
##    <fct> <int>
##  1 A         8
##  2 AC        1
##  3 B        23
##  4 C        65
##  5 M        15
##  6 N         9
##  7 NS        3
##  8 S        91
##  9 SA        1
## 10 U         6
## 11 UW        2
## 12 W        27
## 13 WB        2
## 14 WC        6
## 15 <NA>    412

# check everything once more
skimr::skim(extracted_lines_clean)

Data summary
Name	extracted_lines_clean
Number of rows	671
Number of columns	4
_______________________
Column type frequency:
character	1
factor	3
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
TF_name	0	1	1	7	0	542	0

Variable type: factor

skim_variable	n_missing	complete_rate	ordered	n_unique	top_counts
part_tested	0	1.00	FALSE	2	DBD: 444, FL: 227
class	0	1.00	FALSE	7	Met: 221, No : 205, Met: 148, Lit: 39
note	412	0.39	FALSE	14	S: 91, C: 65, W: 27, B: 23

# SAVE ---------------------
write_delim(
  extracted_lines_clean,
  file = here("DATA", "extracted_lines_clean_all.dat"),
  delim = "\t"
)

create_TF_methyl_binding_dataset.R

jro049

2022-01-06