# DESCRIPTION ---------------
# Get the data extracted from Yin, Y., et al. (2017). 'Impact of cytosine
# methylation on DNA binding specificities of human transcription factors.'
# Science, 356(6337), eaaj2239. https://doi.org/10.1126/science.aaj2239 .
# The supplementary PDF file (Databases_S1_for_submission.pdf) includes
# all the measured TFs with their classification based on whether their
# binding sequence includes a CpG or not and based on whether they bind
# better to the methylated CpG or not.
# PDF is not a good way of keeping that information; I've exported the text
# only into DATA/yin_science_2017_TF_types.txt and then ran the script
# 'extract_data_from_yin.sh' to create a file with one TF per line
# (DATA/extracted_lines_clean.txt).
# I can analyse this data from there.
#
# From the supplementary info - some details on special marking:
# *A* Signal from Another TF family: Raw sequence data contains reads
# belonging to another TF family. These are not included to the
# model presented as the seed excludes them. Information provided
# to aid bioinformatic analyses of full data.
# *B* CG frequency Below cut-off: CG containing subsequences were
# detected, but frequency of the CG was below the 10% cut-off for
# calling the effect of methylation on binding.
# *C* Call based on bisulfite SELEX: kmer analysis does not detect
# effect, call is based on bisulfite SELEX. This is caused mostly
# by absolute requirement for CG in site, in which case the small
# fraction of unmethylated CG containing ligands that is left after
# MSssI treatment is enriched.
# *M* Motif-dependent effect: TF has multiple motifs, with different
# effect of CG methylation.
# *N* Weak enrichment with Non-specific signal: Enrichment for the motif
# was weak, there is also signal for enrichment of subsequences
# that enrich weakly in many weak or failed experiments.
# *S* Suboptimal k-mer length: kmer is too short or long compared to
# the corresponding motif. 8mer analysis does not detect the CG or
# doesn't detect the CG at the correct position.
# *U* Unique reads used to detect signal: Selection library has low
# diversity after selection, unique reads used to detect signal.
# *W* Weak: Obtained motif has relatively low number of reads or weak
# enrichment.
#
# Additionally, the abbreviations after the name of the TF mean:
# *FL* full length of the protein was tested
# *DBD* DNA-binding domain
# *eDBD* extended DNA-binding domain
#
# AUTHOR: Julia Romanowska
# DATE CREATED: 2022-01-06
# DATE MODIFIED:
# SETUP -----------------
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 2.0.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(here)
## here() starts at /scratch/Naukowe/START_project/X-chrom
# READ DATA -----------------------
extracted_lines <- read_delim(
here("DATA", "extracted_lines_clean.txt"),
delim = ":",
col_names = FALSE
)
## Rows: 671 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ":"
## chr (2): X1, X2
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
extracted_lines
## # A tibble: 671 × 2
## X1 X2
## <chr> <chr>
## 1 "TFAP2A _ FL " " MethylMinus"
## 2 "TFAP2B _ DBD " " MethylMinus"
## 3 "TFAP2C _ DBD " " MethylMinus"
## 4 "TFAP2E _ FL " " MethylMinus"
## 5 "GRHL1 _ DBD " " MethylPlus *C*"
## 6 "TFCP2 _ DBD " " MethylPlus *C*"
## 7 "TFCP2 _ FL " " MethylPlus *W*"
## 8 "TFCP2L1 _ DBD " " MethylPlus *C*"
## 9 "UBP1 _ DBD " " MethylPlus *C*"
## 10 "YBX1 _ DBD " " No CpG"
## # … with 661 more rows
colnames(extracted_lines) <- c("TF_full_name", "class_full")
extracted_lines
## # A tibble: 671 × 2
## TF_full_name class_full
## <chr> <chr>
## 1 "TFAP2A _ FL " " MethylMinus"
## 2 "TFAP2B _ DBD " " MethylMinus"
## 3 "TFAP2C _ DBD " " MethylMinus"
## 4 "TFAP2E _ FL " " MethylMinus"
## 5 "GRHL1 _ DBD " " MethylPlus *C*"
## 6 "TFCP2 _ DBD " " MethylPlus *C*"
## 7 "TFCP2 _ FL " " MethylPlus *W*"
## 8 "TFCP2L1 _ DBD " " MethylPlus *C*"
## 9 "UBP1 _ DBD " " MethylPlus *C*"
## 10 "YBX1 _ DBD " " No CpG"
## # … with 661 more rows
# TIDY ------------------
# remove all the whitespace characters and divide into extra columns
extracted_lines_clean <- extracted_lines %>%
separate(
col = TF_full_name,
into = c("TF_name", "part_tested"),
sep = "_"
) %>%
separate(
col = class_full,
into = c("class", "note"),
sep = "\\*",
extra = "merge",
fill = "right"
) %>%
mutate(across(everything(), str_trim)) %>%
mutate(across(c(class, part_tested), as.factor))
extracted_lines_clean
## # A tibble: 671 × 4
## TF_name part_tested class note
## <chr> <fct> <fct> <chr>
## 1 TFAP2A FL MethylMinus <NA>
## 2 TFAP2B DBD MethylMinus <NA>
## 3 TFAP2C DBD MethylMinus <NA>
## 4 TFAP2E FL MethylMinus <NA>
## 5 GRHL1 DBD MethylPlus C*
## 6 TFCP2 DBD MethylPlus C*
## 7 TFCP2 FL MethylPlus W*
## 8 TFCP2L1 DBD MethylPlus C*
## 9 UBP1 DBD MethylPlus C*
## 10 YBX1 DBD No CpG <NA>
## # … with 661 more rows
skimr::skim(extracted_lines_clean)
Data summary
Name |
extracted_lines_clean |
Number of rows |
671 |
Number of columns |
4 |
_______________________ |
|
Column type frequency: |
|
character |
2 |
factor |
2 |
________________________ |
|
Group variables |
None |
Variable type: character
TF_name |
0 |
1.00 |
1 |
7 |
0 |
542 |
0 |
note |
412 |
0.39 |
2 |
3 |
0 |
15 |
0 |
Variable type: factor
part_tested |
0 |
1 |
FALSE |
2 |
DBD: 444, FL: 227 |
class |
0 |
1 |
FALSE |
9 |
Met: 221, No : 205, Met: 147, Lit: 39 |
# check the various classes
# - TF name
extracted_lines_clean %>%
count(TF_name) %>%
count(n)
## Storing counts in `nn`, as `n` already present in input
## ℹ Use `name = "new_name"` to pick a new name.
## # A tibble: 2 × 2
## n nn
## <int> <int>
## 1 1 413
## 2 2 129
# why are there more than one entry for some TFs?
surplus_TF_entries <- extracted_lines_clean %>%
count(TF_name) %>%
filter(n > 1) %>%
pull(TF_name)
extracted_lines_clean %>%
filter(TF_name %in% surplus_TF_entries)
## # A tibble: 258 × 4
## TF_name part_tested class note
## <chr> <fct> <fct> <chr>
## 1 TFCP2 DBD MethylPlus C*
## 2 TFCP2 FL MethylPlus W*
## 3 ONECUT1 DBD MethylMinus <NA>
## 4 ONECUT1 FL MethylMinus <NA>
## 5 ONECUT2 DBD MethylMinus <NA>
## 6 ONECUT2 FL MethylMinus <NA>
## 7 ASCL2 DBD No CpG <NA>
## 8 ASCL2 FL MethylMinus S*
## 9 BHLHE23 DBD No CpG <NA>
## 10 BHLHE23 FL No CpG W*
## # … with 248 more rows
# looks OK
# - check part_tested
extracted_lines_clean %>%
count(part_tested)
## # A tibble: 2 × 2
## part_tested n
## <fct> <int>
## 1 DBD 444
## 2 FL 227
# - check class
extracted_lines_clean %>%
count(class)
## # A tibble: 9 × 2
## class n
## <fct> <int>
## 1 inconclusive 17
## 2 Inconclusive 12
## 3 Little effect 39
## 4 MethylMinus 147
## 5 MethylMInus 1
## 6 MethylPlus 221
## 7 Motif dependent 15
## 8 Multiple effects 14
## 9 No CpG 205
# need to fix wrong typing
extracted_lines_clean <- extracted_lines_clean %>%
mutate(class = as.character(class)) %>%
mutate(class = case_when(
class == "inconclusive" ~ "Inconclusive",
class == "MethylMInus" ~ "MethylMinus",
TRUE ~ class
)) %>%
mutate(class = as.factor(class))
extracted_lines_clean %>%
count(class)
## # A tibble: 7 × 2
## class n
## <fct> <int>
## 1 Inconclusive 29
## 2 Little effect 39
## 3 MethylMinus 148
## 4 MethylPlus 221
## 5 Motif dependent 15
## 6 Multiple effects 14
## 7 No CpG 205
# - check note
extracted_lines_clean %>%
count(note)
## # A tibble: 16 × 2
## note n
## <chr> <int>
## 1 A* 8
## 2 AC* 1
## 3 B* 23
## 4 C* 65
## 5 M* 15
## 6 N* 9
## 7 NS* 1
## 8 S* 91
## 9 SA* 1
## 10 SN* 2
## 11 U* 6
## 12 UW* 2
## 13 W* 27
## 14 WB* 2
## 15 WC* 6
## 16 <NA> 412
# need to remove the asterisk
extracted_lines_clean <- extracted_lines_clean %>%
mutate(note = str_replace(note, fixed("*"), ""))
extracted_lines_clean %>%
count(note)
## # A tibble: 16 × 2
## note n
## <chr> <int>
## 1 A 8
## 2 AC 1
## 3 B 23
## 4 C 65
## 5 M 15
## 6 N 9
## 7 NS 1
## 8 S 91
## 9 SA 1
## 10 SN 2
## 11 U 6
## 12 UW 2
## 13 W 27
## 14 WB 2
## 15 WC 6
## 16 <NA> 412
# there are two types of 'note' that are the same: NS and SN
extracted_lines_clean <- extracted_lines_clean %>%
mutate(note = ifelse(
note == "SN",
yes = "NS",
no = note
)) %>%
mutate(note = as.factor(note))
extracted_lines_clean %>%
count(note)
## # A tibble: 15 × 2
## note n
## <fct> <int>
## 1 A 8
## 2 AC 1
## 3 B 23
## 4 C 65
## 5 M 15
## 6 N 9
## 7 NS 3
## 8 S 91
## 9 SA 1
## 10 U 6
## 11 UW 2
## 12 W 27
## 13 WB 2
## 14 WC 6
## 15 <NA> 412
# check everything once more
skimr::skim(extracted_lines_clean)
Data summary
Name |
extracted_lines_clean |
Number of rows |
671 |
Number of columns |
4 |
_______________________ |
|
Column type frequency: |
|
character |
1 |
factor |
3 |
________________________ |
|
Group variables |
None |
Variable type: character
Variable type: factor
part_tested |
0 |
1.00 |
FALSE |
2 |
DBD: 444, FL: 227 |
class |
0 |
1.00 |
FALSE |
7 |
Met: 221, No : 205, Met: 148, Lit: 39 |
note |
412 |
0.39 |
FALSE |
14 |
S: 91, C: 65, W: 27, B: 23 |
# SAVE ---------------------
write_delim(
extracted_lines_clean,
file = here("DATA", "extracted_lines_clean_all.dat"),
delim = "\t"
)