Skip to contents

This notebook parses AntiSmash and BigScape output and is heavily inspired by the work of Roberto Sánchez.

https://github.com/robertosanchezn/AS_hqMAGs/blob/main/r_markdown/notebook/parse_bigscape.R

Antismash

Set up

library(RauENVS)
# Locate the example antiSMASH directory shipped inside the RauENVS package.
antismash_directory <- system.file(
  "antismash_example_directory",
  package = "RauENVS"
)
# To analyse your own data, point `antismash_directory` at your own antiSMASH
# output directory and remove (or comment out) the example assignment above.
# For example:
# antismash_directory <- "/mnt"

First, we need to identify the json files:

# List the JSON files found under `dir`.
#
# dir: path of the directory to search; sub-directories are searched
#      recursively.
# Returns a character vector with the full path of every entry whose name
# ends in ".json".
list_jsons <- function(dir) {
  list.files(
    dir,
    # Escape the dot so that only a literal ".json" suffix matches; an
    # unescaped "." also accepts names such as "notjson".
    pattern = "\\.json$",
    full.names = TRUE,
    recursive = TRUE,
    # Kept from the original call: a directory whose name ends in ".json" is
    # returned as well. NOTE(review): confirm this is intended.
    include.dirs = TRUE
  )
}
# Collect the JSON paths under the antiSMASH directory and display them.
jsons <- list_jsons(antismash_directory)
print(jsons)
#> [1] "/home/runner/work/_temp/Library/RauENVS/antismash_example_directory/antismash.json"
# The genome id is the part of each file name that precedes ".json".
genome_ids <- stringr::str_match(basename(jsons), "(^.+)\\.json")[, 2]
print(genome_ids)
#> [1] "antismash"
# Label every path with its genome id.
names(jsons) <- genome_ids

Get features

Now, we want to extract the features for each json file.

# Extract the features of every JSON file. The second argument names the
# feature types requested ("region" and "aSModule") -- presumably the
# antiSMASH feature kinds to keep; confirm against get_antismash_features().
features <- get_antismash_features(jsons, c("region", "aSModule"))

`features` is a large list, so inspect it carefully. It should contain one element per JSON file:

# Sanity check: stop with an error if the number of elements in `features`
# differs from the number of JSON files.
stopifnot(length(features) == length(jsons))

Regions

Now, we extract all regions:

# Build the regions table from the extracted features.
regions <- get_antismash_regions(features)
# Render the first 30 rows as a table.
knitr::kable(head(regions, n = 30))
file location type contig_edge product contig bgc_id
antismash [292344:342785] region FALSE bacteriocin, NRPS .region001
antismash [999349:1105105] region FALSE transAT-PKS , transAT-PKS-like .region002
antismash [1222307:1263002] region FALSE T3PKS .region003
antismash [1347426:1367554] region FALSE terpene .region004
antismash [1397403:1532936] region FALSE betalactone, NRPS , transAT-PKS .region005
antismash [1724843:1834411] region FALSE NRPS , T3PKS , transAT-PKS , transAT-PKS-like .region006
antismash [2053604:2141836] region FALSE transAT-PKS .region007
antismash [2448155:2468896] region FALSE terpene .region008
antismash [2553885:2595130] region FALSE PKS-like .region009
antismash [2824697:2865819] region FALSE ladderane .region010
antismash [3163092:3228500] region FALSE NRPS .region011
antismash [3855452:3896871] region FALSE other .region012

You will notice that the product column is nested (each cell is a list of values). You can either paste the different values:

regions |>
  dplyr::mutate(
    # Collapse each list cell into a single comma-separated string.
    # map_chr() returns a character vector directly and raises an error if a
    # cell does not reduce to exactly one string, instead of silently
    # coercing a list with as.character().
    product = purrr::map_chr(product, paste0, collapse = ",")
  ) |>
  head(30) |>
  knitr::kable()
file location type contig_edge product contig bgc_id
antismash [292344:342785] region FALSE bacteriocin,NRPS .region001
antismash [999349:1105105] region FALSE transAT-PKS,transAT-PKS-like .region002
antismash [1222307:1263002] region FALSE T3PKS .region003
antismash [1347426:1367554] region FALSE terpene .region004
antismash [1397403:1532936] region FALSE betalactone,NRPS,transAT-PKS .region005
antismash [1724843:1834411] region FALSE NRPS,T3PKS,transAT-PKS,transAT-PKS-like .region006
antismash [2053604:2141836] region FALSE transAT-PKS .region007
antismash [2448155:2468896] region FALSE terpene .region008
antismash [2553885:2595130] region FALSE PKS-like .region009
antismash [2824697:2865819] region FALSE ladderane .region010
antismash [3163092:3228500] region FALSE NRPS .region011
antismash [3855452:3896871] region FALSE other .region012

Or you can unnest the DataFrame, creating one row per product:

# Expand the nested `product` column and render the first 30 rows of the
# result as a table.
tidyr::unnest(regions, "product") |>
  head(n = 30) |>
  knitr::kable()
file location type contig_edge product contig bgc_id
antismash [292344:342785] region FALSE bacteriocin .region001
antismash [292344:342785] region FALSE NRPS .region001
antismash [999349:1105105] region FALSE transAT-PKS .region002
antismash [999349:1105105] region FALSE transAT-PKS-like .region002
antismash [1222307:1263002] region FALSE T3PKS .region003
antismash [1347426:1367554] region FALSE terpene .region004
antismash [1397403:1532936] region FALSE betalactone .region005
antismash [1397403:1532936] region FALSE NRPS .region005
antismash [1397403:1532936] region FALSE transAT-PKS .region005
antismash [1724843:1834411] region FALSE NRPS .region006
antismash [1724843:1834411] region FALSE T3PKS .region006
antismash [1724843:1834411] region FALSE transAT-PKS .region006
antismash [1724843:1834411] region FALSE transAT-PKS-like .region006
antismash [2053604:2141836] region FALSE transAT-PKS .region007
antismash [2448155:2468896] region FALSE terpene .region008
antismash [2553885:2595130] region FALSE PKS-like .region009
antismash [2824697:2865819] region FALSE ladderane .region010
antismash [3163092:3228500] region FALSE NRPS .region011
antismash [3855452:3896871] region FALSE other .region012

Remember to assign the processed data frame to a variable before writing it to a file. For example:

# Keep the unnested table in its own variable so that it can be written out.
regions_unnest <- tidyr::unnest(regions, "product")
# write.csv(regions_unnest, "regions.csv")

Modules

Now, we extract the modules:

# Build the modules table from the extracted features.
modules <- get_antismash_modules(features)
# write.csv(modules, "/mnt/modules.csv")
# Render the first 30 rows as a table.
knitr::kable(head(modules, n = 30))
file location complete type iterative
antismash [295017:295515] FALSE unknown FALSE
antismash [312427:312892] FALSE unknown FALSE
antismash [314517:315729] FALSE nrps FALSE
antismash [316751:316937] FALSE unknown FALSE
antismash [317020:320092] TRUE nrps FALSE
antismash [320134:324067] TRUE nrps FALSE
antismash [1019210:1020041] FALSE pks FALSE
antismash [1022529:1022739] FALSE unknown FALSE
antismash [1022816:1023860] FALSE nrps FALSE
antismash [1024165:1024666] FALSE unknown FALSE
antismash [1024939:1025149] FALSE unknown FALSE
antismash [1025281:1030747] TRUE pks FALSE
antismash [1030891:1031074] FALSE unknown FALSE
antismash [1031221:1035418] TRUE pks FALSE
antismash [1035550:1037368] FALSE pks FALSE
antismash [1037719:1038733] FALSE unknown FALSE
antismash [1038892:1042354] TRUE pks FALSE
antismash [1042477:1043683] FALSE pks FALSE
antismash [1043898:1045017] FALSE unknown FALSE
antismash [1045152:1047294] TRUE pks FALSE
antismash [1047402:1049409] FALSE unknown FALSE
antismash [1050048:1051038] FALSE unknown FALSE
antismash [1051128:1051326] FALSE unknown FALSE
antismash [1051509:1055778] TRUE pks FALSE
antismash [1055943:1060086] TRUE pks FALSE
antismash [1060239:1063737] TRUE pks FALSE
antismash [1063857:1065108] FALSE pks FALSE
antismash [1065322:1066471] FALSE unknown FALSE
antismash [1067089:1067635] FALSE unknown FALSE
antismash [1067905:1071412] TRUE pks FALSE

BigScape

# Locate the example BiG-SCAPE directory shipped inside the RauENVS package.
bigscape_directory <- system.file(
  "bigscape_example_directory",
  package = "RauENVS"
)
# To analyse your own data, point `bigscape_directory` at your own BiG-SCAPE
# output directory and remove (or comment out) the example assignment above.
# For example:
# bigscape_directory <- "/mnt/"

First, we will create a DataFrame with clustering information:

# Build the clustering table from the BiG-SCAPE directory.
bigscape_clusters <- get_bigscape_clustering(bigscape_directory)
# write_csv(bigscape_clusters, "/mnt/bigscape_df.csv")
# Render the first 30 rows as a table.
knitr::kable(head(bigscape_clusters, n = 30))
bgc_id class GCF_c0.30
10B3s2 Others 12
10B4 Others 16
5B2new Others 80
5C1 Others 92
5C1_4.region001 Others 92
CCC2s4new Others 92
CCC2s4new_14.region001 Others 92
10B2 Others 95
10B2new Others 95
10I4 Others 95
5B2 Others 95
5C4 Others 95
10C2 Others 125
10C2_4.region001 Others 125
10I1 Others 125
10I1_13.region001 Others 125
10I1new_43.region001 Others 125
10S3 Others 125
10S3_14.region001 Others 125
5B3 Others 125
5I3 Others 125
5I3_1.region001 Others 125
5I3new Others 125
5I3new_13.region001 Others 125
RTC2s1 Others 125
RTC2s1_3.region001 Others 125
RTC2s1new Others 125
RTS3_13.region001 Others 125
5I2 Others 190
5I2_2.region001 Others 190

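We can extract more information. For example, to find the GCFs that contain BGCs whose identifier looks like a MIBiG accession (such as `BGC0000001.1`); the example data contains none, so the result is empty: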

library(tidyverse)
# Keep the rows whose id starts with "BGC", seven digits and ".1", then
# return the values of the last GCF column (pull() defaults to the last
# column of its input).
bigscape_clusters |>
  filter(str_detect(bgc_id, "^BGC\\d{7}\\.1")) |>
  select(starts_with("GCF")) |>
  pull()
#> character(0)

Finally, we read network information:

# Build the network table from the BiG-SCAPE directory.
# NOTE(review): in the rendered example output the `class` column holds a full
# directory path ending in "/mix" rather than a bare class name -- check
# whether get_bigscape_networks() should reduce it with basename().
bigscape_networks <- get_bigscape_networks(bigscape_directory)
# write_csv(bigscape_networks, "/mnt/bigscape_networks.csv")
# Render the first 30 rows as a table.
knitr::kable(head(bigscape_networks, n = 30))
bgc_id_1 bgc_id_2 raw_distance squared_similarity cutoff class
10I5_27.region001 RTS3_6.region001 0.0001412 0.9997177 0.3 /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix
RTI3 RTI3_47.region001 0.0000000 1.0000000 0.3 /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix
LG9s2new_12.region001 RTB1new 0.0075563 0.9849444 0.3 /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix
Cout2_1.region001 RTI4new_10.region001 0.2747348 0.5260096 0.3 /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix
5I3new RTC2s1_31.region001 0.0187363 0.9628785 0.3 /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix
5I2new_9.region001 RTI2s1 0.2484404 0.5648419 0.3 /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix
Cout2 Gout2_30.region001 0.0001264 0.9997472 0.3 /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix
CA11s2_11.region001 Cout2_8.region001 0.2028430 0.6354593 0.3 /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix
5I2_1.region001 CCC3s4 0.0000000 1.0000000 0.3 /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix
CCC2s1 CCI2s3_7.region001 0.2955131 0.4963019 0.3 /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix
10C2_32.region001 10S3_3.region001 0.0004702 0.9990599 0.3 /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix
LG9s2new_1.region001 RTB1_65.region001 0.0174108 0.9654816 0.3 /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix
RTI4_25.region001 RTI4new 0.0000000 1.0000000 0.3 /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix
5I2_11.region001 5I2new 0.0000000 1.0000000 0.3 /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix
10I5 RTC2s1new_69.region001 0.1055529 0.8000355 0.3 /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix
5I3_1.region001 RTC2s1_3.region001 0.0515031 0.8996463 0.3 /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix
10S5 5S4_1.region002 0.0000000 1.0000000 0.3 /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix
10I1 10I1new_43.region001 0.0000000 1.0000000 0.3 /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix
10B2_97.region001 5C4 0.0000000 1.0000000 0.3 /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix
AA4_22.region001 AA4new_32.region001 0.0000000 1.0000000 0.3 /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix
LG9s2new LG9s2new_4.region001 0.0000000 1.0000000 0.3 /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix
10B2new_2.region001 5C4_19.region001 0.0000000 1.0000000 0.3 /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix
5S4_13.region001 AB12_1.region002 0.0480534 0.9062024 0.3 /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix
AA4new_32.region001 RTS4_82.region001 0.0009455 0.9981099 0.3 /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix
10S3_3.region002 5I3_2.region001 0.1382006 0.7426981 0.3 /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix
RTB1new RTB1new_3.region001 0.0000000 1.0000000 0.3 /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix
CCC2s1 CCC2s1_8.region001 0.0000000 1.0000000 0.3 /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix
10B2_97.region001 10I4_53.region001 0.0000000 1.0000000 0.3 /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix
LG9s2_19.region001 RTB1_17.region001 0.0096855 0.9807229 0.3 /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix
RTI2s1 RTI2s2new_5.region001 0.0000000 1.0000000 0.3 /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix