Parse AntiSmash and BigScape
Author Placeholder
AU-ENVS Bioinformatics — parse_antismash.Rmd
This notebook parses AntiSmash and BigScape output and is heavily inspired by the work of Roberto Sánchez.
https://github.com/robertosanchezn/AS_hqMAGs/blob/main/r_markdown/notebook/parse_bigscape.R
Antismash
Set up
library(RauENVS)
# Path to the example AntiSmash output bundled with the RauENVS package.
antismash_directory <- system.file(
  "antismash_example_directory",
  package = "RauENVS"
)
# Replace the example above with the path to your own AntiSmash output
# directory (then remove or comment out the example), for instance:
# antismash_directory <- "/mnt"
First, we need to identify the json files:
# List every JSON file found under `dir`, searching subdirectories too.
#
# @param dir Path to the directory to search (e.g. an AntiSmash output
#   directory).
# @return A character vector with the full paths of all entries whose name
#   ends in ".json".
list_jsons <- function(dir) {
  list.files(
    dir,
    # Escape the dot so that only a literal ".json" extension matches; a bare
    # "." matches any character and would also pick up names such as "xjson".
    pattern = "\\.json$",
    full.names = TRUE,
    recursive = TRUE,
    # NOTE(review): this also returns *directories* whose name ends in
    # ".json"; kept as in the original call -- confirm that is intended.
    include.dirs = TRUE
  )
}
# Collect the JSON paths and show them.
jsons <- list_jsons(antismash_directory)
jsons
#> [1] "/home/runner/work/_temp/Library/RauENVS/antismash_example_directory/antismash.json"
# Derive one genome id per file from its base name (text before ".json").
genome_ids <- stringr::str_match(basename(jsons), "(^.+)\\.json")[, 2]
genome_ids
#> [1] "antismash"
# Label each path with its genome id.
jsons <- setNames(jsons, genome_ids)
Get features
Now, we want to extract the features for each json file.
# Request the "region" and "aSModule" feature types for the named JSON paths.
features <- get_antismash_features(
  jsons,
  c("region", "aSModule")
)
`features` is a large list, so inspect it carefully. It should contain as many sub-lists as there are JSON files.
Regions
Now, we are extracting all regions:
# Build the region table from the parsed features.
regions <- get_antismash_regions(features)
# Render the first 30 rows as a table.
knitr::kable(head(regions, n = 30))
file | location | type | contig_edge | product | contig | bgc_id |
---|---|---|---|---|---|---|
antismash | [292344:342785] | region | FALSE | bacteriocin, NRPS | .region001 | |
antismash | [999349:1105105] | region | FALSE | transAT-PKS , transAT-PKS-like | .region002 | |
antismash | [1222307:1263002] | region | FALSE | T3PKS | .region003 | |
antismash | [1347426:1367554] | region | FALSE | terpene | .region004 | |
antismash | [1397403:1532936] | region | FALSE | betalactone, NRPS , transAT-PKS | .region005 | |
antismash | [1724843:1834411] | region | FALSE | NRPS , T3PKS , transAT-PKS , transAT-PKS-like | .region006 | |
antismash | [2053604:2141836] | region | FALSE | transAT-PKS | .region007 | |
antismash | [2448155:2468896] | region | FALSE | terpene | .region008 | |
antismash | [2553885:2595130] | region | FALSE | PKS-like | .region009 | |
antismash | [2824697:2865819] | region | FALSE | ladderane | .region010 | |
antismash | [3163092:3228500] | region | FALSE | NRPS | .region011 | |
antismash | [3855452:3896871] | region | FALSE | other | .region012 |
You will notice that the `product` column is nested (each cell is a list of values). You can either paste the different values together into a single string:
# Collapse each cell of the nested `product` column into one comma-separated
# string, then render the first 30 rows as a table.
regions |>
  dplyr::mutate(
    product = vapply(
      product, paste0, character(1),
      collapse = ",", USE.NAMES = FALSE
    )
  ) |>
  head(n = 30) |>
  knitr::kable()
file | location | type | contig_edge | product | contig | bgc_id |
---|---|---|---|---|---|---|
antismash | [292344:342785] | region | FALSE | bacteriocin,NRPS | .region001 | |
antismash | [999349:1105105] | region | FALSE | transAT-PKS,transAT-PKS-like | .region002 | |
antismash | [1222307:1263002] | region | FALSE | T3PKS | .region003 | |
antismash | [1347426:1367554] | region | FALSE | terpene | .region004 | |
antismash | [1397403:1532936] | region | FALSE | betalactone,NRPS,transAT-PKS | .region005 | |
antismash | [1724843:1834411] | region | FALSE | NRPS,T3PKS,transAT-PKS,transAT-PKS-like | .region006 | |
antismash | [2053604:2141836] | region | FALSE | transAT-PKS | .region007 | |
antismash | [2448155:2468896] | region | FALSE | terpene | .region008 | |
antismash | [2553885:2595130] | region | FALSE | PKS-like | .region009 | |
antismash | [2824697:2865819] | region | FALSE | ladderane | .region010 | |
antismash | [3163092:3228500] | region | FALSE | NRPS | .region011 | |
antismash | [3855452:3896871] | region | FALSE | other | .region012 |
Or you can unnest the data frame with `tidyr::unnest("product")`, creating one row for each product:
file | location | type | contig_edge | product | contig | bgc_id |
---|---|---|---|---|---|---|
antismash | [292344:342785] | region | FALSE | bacteriocin | .region001 | |
antismash | [292344:342785] | region | FALSE | NRPS | .region001 | |
antismash | [999349:1105105] | region | FALSE | transAT-PKS | .region002 | |
antismash | [999349:1105105] | region | FALSE | transAT-PKS-like | .region002 | |
antismash | [1222307:1263002] | region | FALSE | T3PKS | .region003 | |
antismash | [1347426:1367554] | region | FALSE | terpene | .region004 | |
antismash | [1397403:1532936] | region | FALSE | betalactone | .region005 | |
antismash | [1397403:1532936] | region | FALSE | NRPS | .region005 | |
antismash | [1397403:1532936] | region | FALSE | transAT-PKS | .region005 | |
antismash | [1724843:1834411] | region | FALSE | NRPS | .region006 | |
antismash | [1724843:1834411] | region | FALSE | T3PKS | .region006 | |
antismash | [1724843:1834411] | region | FALSE | transAT-PKS | .region006 | |
antismash | [1724843:1834411] | region | FALSE | transAT-PKS-like | .region006 | |
antismash | [2053604:2141836] | region | FALSE | transAT-PKS | .region007 | |
antismash | [2448155:2468896] | region | FALSE | terpene | .region008 | |
antismash | [2553885:2595130] | region | FALSE | PKS-like | .region009 | |
antismash | [2824697:2865819] | region | FALSE | ladderane | .region010 | |
antismash | [3163092:3228500] | region | FALSE | NRPS | .region011 | |
antismash | [3855452:3896871] | region | FALSE | other | .region012 |
Remember to assign your processed data frame to a variable before writing it to a file. For example:
# Keep the unnested table (one row per product) in its own variable.
regions_unnest <- tidyr::unnest(regions, "product")
# Uncomment to export it:
# write.csv(regions_unnest, "regions.csv")
Modules
Now, we extract the modules:
# Build the module table from the parsed features.
modules <- get_antismash_modules(features)
# Uncomment to export the full table:
# write.csv(modules, "/mnt/modules.csv")
# Render the first 30 rows as a table.
knitr::kable(head(modules, n = 30))
file | location | complete | type | iterative |
---|---|---|---|---|
antismash | [295017:295515] | FALSE | unknown | FALSE |
antismash | [312427:312892] | FALSE | unknown | FALSE |
antismash | [314517:315729] | FALSE | nrps | FALSE |
antismash | [316751:316937] | FALSE | unknown | FALSE |
antismash | [317020:320092] | TRUE | nrps | FALSE |
antismash | [320134:324067] | TRUE | nrps | FALSE |
antismash | [1019210:1020041] | FALSE | pks | FALSE |
antismash | [1022529:1022739] | FALSE | unknown | FALSE |
antismash | [1022816:1023860] | FALSE | nrps | FALSE |
antismash | [1024165:1024666] | FALSE | unknown | FALSE |
antismash | [1024939:1025149] | FALSE | unknown | FALSE |
antismash | [1025281:1030747] | TRUE | pks | FALSE |
antismash | [1030891:1031074] | FALSE | unknown | FALSE |
antismash | [1031221:1035418] | TRUE | pks | FALSE |
antismash | [1035550:1037368] | FALSE | pks | FALSE |
antismash | [1037719:1038733] | FALSE | unknown | FALSE |
antismash | [1038892:1042354] | TRUE | pks | FALSE |
antismash | [1042477:1043683] | FALSE | pks | FALSE |
antismash | [1043898:1045017] | FALSE | unknown | FALSE |
antismash | [1045152:1047294] | TRUE | pks | FALSE |
antismash | [1047402:1049409] | FALSE | unknown | FALSE |
antismash | [1050048:1051038] | FALSE | unknown | FALSE |
antismash | [1051128:1051326] | FALSE | unknown | FALSE |
antismash | [1051509:1055778] | TRUE | pks | FALSE |
antismash | [1055943:1060086] | TRUE | pks | FALSE |
antismash | [1060239:1063737] | TRUE | pks | FALSE |
antismash | [1063857:1065108] | FALSE | pks | FALSE |
antismash | [1065322:1066471] | FALSE | unknown | FALSE |
antismash | [1067089:1067635] | FALSE | unknown | FALSE |
antismash | [1067905:1071412] | TRUE | pks | FALSE |
BigScape
# Path to the example BigScape output bundled with the RauENVS package.
bigscape_directory <- system.file(
  "bigscape_example_directory",
  package = "RauENVS"
)
# Replace the example above with the path to your own BigScape output
# directory (then remove or comment out the example), for instance:
# bigscape_directory <- "/mnt/"
First, we will create a DataFrame with clustering information:
# Read the clustering information found under the BigScape directory.
bigscape_clusters <- get_bigscape_clustering(bigscape_directory)
# Uncomment to export the full table (namespaced because readr is not
# attached at this point of the notebook):
# readr::write_csv(bigscape_clusters, "/mnt/bigscape_df.csv")
# Render the first 30 rows as a table.
knitr::kable(head(bigscape_clusters, n = 30))
bgc_id | class | GCF_c0.30 |
---|---|---|
10B3s2 | Others | 12 |
10B4 | Others | 16 |
5B2new | Others | 80 |
5C1 | Others | 92 |
5C1_4.region001 | Others | 92 |
CCC2s4new | Others | 92 |
CCC2s4new_14.region001 | Others | 92 |
10B2 | Others | 95 |
10B2new | Others | 95 |
10I4 | Others | 95 |
5B2 | Others | 95 |
5C4 | Others | 95 |
10C2 | Others | 125 |
10C2_4.region001 | Others | 125 |
10I1 | Others | 125 |
10I1_13.region001 | Others | 125 |
10I1new_43.region001 | Others | 125 |
10S3 | Others | 125 |
10S3_14.region001 | Others | 125 |
5B3 | Others | 125 |
5I3 | Others | 125 |
5I3_1.region001 | Others | 125 |
5I3new | Others | 125 |
5I3new_13.region001 | Others | 125 |
RTC2s1 | Others | 125 |
RTC2s1_3.region001 | Others | 125 |
RTC2s1new | Others | 125 |
RTS3_13.region001 | Others | 125 |
5I2 | Others | 190 |
5I2_2.region001 | Others | 190 |
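We can extract more information. For example, to find the GCFs of the BGCs whose `bgc_id` consists of `BGC` followed by seven digits and `.1` (e.g. `BGC0000001.1`):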
library(tidyverse)
# Keep the rows whose bgc_id starts with "BGC", seven digits and ".1", then
# return the values of the last GCF column.
bigscape_clusters |>
  dplyr::filter(stringr::str_detect(bgc_id, "^BGC\\d{7}\\.1")) |>
  dplyr::select(dplyr::starts_with("GCF")) |>
  dplyr::pull()
#> character(0)
Finally, we read network information:
# Read the network information found under the BigScape directory.
bigscape_networks <- get_bigscape_networks(bigscape_directory)
# Uncomment to export the full table (namespaced because readr may not be
# attached when this chunk is run on its own):
# readr::write_csv(bigscape_networks, "/mnt/bigscape_networks.csv")
# Render the first 30 rows as a table.
knitr::kable(head(bigscape_networks, n = 30))
bgc_id_1 | bgc_id_2 | raw_distance | squared_similarity | cutoff | class |
---|---|---|---|---|---|
10I5_27.region001 | RTS3_6.region001 | 0.0001412 | 0.9997177 | 0.3 | /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix |
RTI3 | RTI3_47.region001 | 0.0000000 | 1.0000000 | 0.3 | /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix |
LG9s2new_12.region001 | RTB1new | 0.0075563 | 0.9849444 | 0.3 | /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix |
Cout2_1.region001 | RTI4new_10.region001 | 0.2747348 | 0.5260096 | 0.3 | /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix |
5I3new | RTC2s1_31.region001 | 0.0187363 | 0.9628785 | 0.3 | /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix |
5I2new_9.region001 | RTI2s1 | 0.2484404 | 0.5648419 | 0.3 | /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix |
Cout2 | Gout2_30.region001 | 0.0001264 | 0.9997472 | 0.3 | /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix |
CA11s2_11.region001 | Cout2_8.region001 | 0.2028430 | 0.6354593 | 0.3 | /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix |
5I2_1.region001 | CCC3s4 | 0.0000000 | 1.0000000 | 0.3 | /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix |
CCC2s1 | CCI2s3_7.region001 | 0.2955131 | 0.4963019 | 0.3 | /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix |
10C2_32.region001 | 10S3_3.region001 | 0.0004702 | 0.9990599 | 0.3 | /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix |
LG9s2new_1.region001 | RTB1_65.region001 | 0.0174108 | 0.9654816 | 0.3 | /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix |
RTI4_25.region001 | RTI4new | 0.0000000 | 1.0000000 | 0.3 | /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix |
5I2_11.region001 | 5I2new | 0.0000000 | 1.0000000 | 0.3 | /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix |
10I5 | RTC2s1new_69.region001 | 0.1055529 | 0.8000355 | 0.3 | /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix |
5I3_1.region001 | RTC2s1_3.region001 | 0.0515031 | 0.8996463 | 0.3 | /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix |
10S5 | 5S4_1.region002 | 0.0000000 | 1.0000000 | 0.3 | /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix |
10I1 | 10I1new_43.region001 | 0.0000000 | 1.0000000 | 0.3 | /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix |
10B2_97.region001 | 5C4 | 0.0000000 | 1.0000000 | 0.3 | /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix |
AA4_22.region001 | AA4new_32.region001 | 0.0000000 | 1.0000000 | 0.3 | /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix |
LG9s2new | LG9s2new_4.region001 | 0.0000000 | 1.0000000 | 0.3 | /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix |
10B2new_2.region001 | 5C4_19.region001 | 0.0000000 | 1.0000000 | 0.3 | /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix |
5S4_13.region001 | AB12_1.region002 | 0.0480534 | 0.9062024 | 0.3 | /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix |
AA4new_32.region001 | RTS4_82.region001 | 0.0009455 | 0.9981099 | 0.3 | /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix |
10S3_3.region002 | 5I3_2.region001 | 0.1382006 | 0.7426981 | 0.3 | /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix |
RTB1new | RTB1new_3.region001 | 0.0000000 | 1.0000000 | 0.3 | /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix |
CCC2s1 | CCC2s1_8.region001 | 0.0000000 | 1.0000000 | 0.3 | /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix |
10B2_97.region001 | 10I4_53.region001 | 0.0000000 | 1.0000000 | 0.3 | /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix |
LG9s2_19.region001 | RTB1_17.region001 | 0.0096855 | 0.9807229 | 0.3 | /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix |
RTI2s1 | RTI2s2new_5.region001 | 0.0000000 | 1.0000000 | 0.3 | /home/runner/work/_temp/Library/RauENVS/bigscape_example_directory/mix |