diff --git a/.github/workflows/check-standard.yaml b/.github/workflows/check-standard.yaml deleted file mode 100644 index 78b9a1ad..00000000 --- a/.github/workflows/check-standard.yaml +++ /dev/null @@ -1,48 +0,0 @@ -# Workflow derived from https://github.com/r-lib/actions/blob/v2.3.1/examples/check-standard.yaml -on: - push: - branches: [main] - pull_request: - branches: [main] - -name: R-CMD-check - -jobs: - R-CMD-check: - runs-on: ${{ matrix.config.os }} - - name: ${{ matrix.config.os }} (${{ matrix.config.r }}) - - strategy: - fail-fast: false - matrix: - config: - - {os: macos-latest, r: 'release'} - - {os: windows-latest, r: 'release'} - - {os: ubuntu-20.04, r: 'devel', http-user-agent: 'release'} - - {os: ubuntu-20.04, r: 'release'} - - {os: ubuntu-20.04, r: 'oldrel-1'} - - env: - GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} - R_KEEP_PKG_SOURCE: yes - - steps: - - uses: actions/checkout@v3 - - - uses: r-lib/actions/setup-pandoc@v2 - - - uses: r-lib/actions/setup-r@v2 - with: - r-version: ${{ matrix.config.r }} - http-user-agent: ${{ matrix.config.http-user-agent }} - use-public-rspm: true - - - uses: r-lib/actions/setup-r-dependencies@v2 - with: - extra-packages: any::rcmdcheck - needs: check - - - uses: r-lib/actions/check-r-package@v2 - with: - upload-snapshots: true diff --git a/.github/workflows/joss.yaml b/.github/workflows/joss.yaml new file mode 100644 index 00000000..4d3a34ff --- /dev/null +++ b/.github/workflows/joss.yaml @@ -0,0 +1,23 @@ +on: [push] + +jobs: + paper: + runs-on: ubuntu-latest + name: JOSS + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Build draft PDF + uses: openjournals/openjournals-draft-action@master + with: + journal: joss + # This should be the path to the paper within your repo. + paper-path: vignettes/paper.md + - name: Upload + uses: actions/upload-artifact@v1 + with: + name: paper + # This is the output path where Pandoc will write the compiled + # PDF. 
Note, this should be the same directory as the input + # paper.md + path: vignettes/paper.pdf diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml deleted file mode 100644 index 2b3a797c..00000000 --- a/.github/workflows/pkgdown.yaml +++ /dev/null @@ -1,51 +0,0 @@ -# Workflow derived from https://github.com/r-lib/actions/blob/v2.3.1/examples/pkgdown.yaml -on: - push: - branches: [main] - pull_request: - branches: [main] - release: - types: [published] - workflow_dispatch: - -name: pkgdown - -jobs: - # Build docs website - pkgdown: - runs-on: ubuntu-20.04 - # Only restrict concurrency for non-PR jobs - concurrency: - group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} - env: - GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} - steps: - - uses: actions/checkout@v3 - - - uses: r-lib/actions/setup-pandoc@v2 - - - uses: r-lib/actions/setup-r@v2 - with: - use-public-rspm: true - - - uses: r-lib/actions/setup-r-dependencies@v2 - with: - extra-packages: any::pkgdown, local::. 
- needs: website - - # creates pkgdown/assets/*.html, which will be copied to site in next step - - name: Create simulation Rmd docs examples in pkgdown/assets/ - run: rmarkdown::render("vignettes/simChef.Rmd") - shell: Rscript {0} - - - name: Build site - run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) - shell: Rscript {0} - - - name: Deploy to GitHub pages 🚀 - if: github.event_name != 'pull_request' - uses: JamesIves/github-pages-deploy-action@v4.4.1 - with: - clean: false - branch: gh-pages - folder: docs diff --git a/vignettes/api_overview.png b/vignettes/api_overview.png new file mode 100644 index 00000000..d139dd59 Binary files /dev/null and b/vignettes/api_overview.png differ diff --git a/vignettes/fit_eval_viz.png b/vignettes/fit_eval_viz.png new file mode 100644 index 00000000..1f471077 Binary files /dev/null and b/vignettes/fit_eval_viz.png differ diff --git a/vignettes/paper.bib b/vignettes/paper.bib new file mode 100644 index 00000000..247f2aae --- /dev/null +++ b/vignettes/paper.bib @@ -0,0 +1,247 @@ + +@article{yu-veridical-2020, + title = {Veridical data science}, + volume = {117}, + issn = {0027-8424, 1091-6490}, + url = {http://www.pnas.org/lookup/doi/10.1073/pnas.1901326117}, + doi = {10.1073/pnas.1901326117}, + language = {en}, + number = {8}, + urldate = {2021-09-04}, + journal = {Proceedings of the National Academy of Sciences}, + author = {Yu, Bin and Kumbier, Karl}, + month = feb, + year = {2020}, + pages = {3920--3929}, +} + +@article{lang-batchtools-2017, + title = {{batchtools: {Tools} for {R} to work on batch systems}}, + volume = {2}, + issn = {2475-9066}, + shorttitle = {batchtools}, + url = {https://joss.theoj.org/papers/10.21105/joss.00135}, + doi = {10.21105/joss.00135}, + language = {en}, + number = {10}, + urldate = {2023-04-20}, + journal = {Journal of Open Source Software}, + author = {Lang, Michel and Bischl, Bernd and Surmann, Dirk}, + month = feb, + year = {2017}, + pages = {135}, +} + 
+@article{wickham-welcome-2019, + title = {Welcome to the {Tidyverse}}, + volume = {4}, + issn = {2475-9066}, + url = {https://joss.theoj.org/papers/10.21105/joss.01686}, + doi = {10.21105/joss.01686}, + language = {en}, + number = {43}, + urldate = {2023-04-20}, + journal = {Journal of Open Source Software}, + author = {Wickham, Hadley and Averick, Mara and Bryan, Jennifer and Chang, Winston and McGowan, Lucy D'Agostino and François, Romain and Grolemund, Garrett and Hayes, Alex and Henry, Lionel and Hester, Jim and Kuhn, Max and Pedersen, Thomas Lin and Miller, Evan and Bache, Stephan Milton and Müller, Kirill and Ooms, Jeroen and Robinson, David and Seidel, Dana Paige and Spinu, Vitalie and Takahashi, Kohske and Vaughan, Davis and Wilke, Claus and Woo, Kara and Yutani, Hiroaki}, + month = nov, + year = {2019}, + pages = {1686}, +} + +@article{bengtsson-unifying-2021, + title = {A {Unifying} {Framework} for {Parallel} and {Distributed} {Processing} in {R} using {Futures}}, + volume = {13}, + issn = {2073-4859}, + url = {https://journal.r-project.org/archive/2021/RJ-2021-048/index.html}, + doi = {10.32614/RJ-2021-048}, + language = {en}, + number = {2}, + urldate = {2023-04-20}, + journal = {The R Journal}, + author = {Bengtsson, Henrik}, + year = {2021}, + pages = {208}, +} + +@Manual{chang-r6-2022, + title = {{R6: Encapsulated Classes with Reference Semantics}}, + author = {Winston Chang}, + year = {2022}, + url = {https://r6.r-lib.org}, +} + +@article{chalmers-simdesign-2020, + author = {Chalmers, R. Philip AND Adkins, Mark C. 
}, + journal = {The Quantitative Methods for Psychology}, + publisher = {TQMP}, + title = {{Writing Effective and Reliable Monte Carlo Simulations with the SimDesign Package}}, + year = {2020}, + volume = {16}, + number = {4}, + pages = {248-280}, + url = {http://www.tqmp.org/RegularArticles/vol16-4/p248/p248.pdf }, + doi = {10.20982/tqmp.16.4.p248}, +} + +@misc{kenny-simengine-2024, + title={{SimEngine: A Modular Framework for Statistical Simulations in R}}, + author={Avi Kenny and Charles J. Wolock}, + year={2024}, + eprint={2403.05698}, + archivePrefix={arXiv}, + primaryClass={stat.CO}, + url={https://doi.org/10.48550/arXiv.2403.05698}, + doi={10.48550/arXiv.2403.05698}, +} + +@Manual{brown-simpr-2023, + title = {{simpr: Flexible 'Tidyverse'-Friendly Simulations}}, + author = {Ethan Brown}, + year = {2023}, + url = {https://statisfactions.github.io/simpr/}, +} + +@article{gasparini-rsimsum-2018, + year = {2018}, + publisher = {The Open Journal}, + volume = {3}, + number = {26}, + pages = {739}, + author = {Gasparini, Alessandro}, + title = {{rsimsum: Summarise results from Monte Carlo simulation studies}}, + journal = {Journal of Open Source Software}, + url = {https://joss.theoj.org/papers/10.21105/joss.00739}, + doi = {10.21105/joss.00739}, +} + +@article{blair-declaredesign-2019, + Author = {Blair, Graeme and Cooper, Jasper and Coppock, Alexander and Humphreys, Macartan}, + Title = {{Declaring and Diagnosing Research Designs}}, + Journal = {American Political Science Review}, + Year = {2019}, + Volume = {113}, + Number = {3}, + Pages = {838--859}, + url = {https://doi.org/10.1017/S0003055419000194}, + doi = {10.1017/S0003055419000194}, +} + +@Manual{joshi-simhelpers-2024, + title = {{simhelpers: Helper Functions for Simulation Studies}}, + author = {Joshi, Megha and Pustejovsky, James}, + year = {2024}, + note = {R package version 0.2.0}, + url = {https://meghapsimatrix.github.io/simhelpers/index.html}, +} + +@Manual{scheer-simTool-2020, + title = {{simTool: 
Conduct Simulation Studies with a Minimal Amount of Source Code}}, + author = {Scheer, Marsel}, + year = {2020}, + note = {R package version 1.1.7}, + url = {https://CRAN.R-project.org/package=simTool}, +} + +@Manual{epskamp-parSim-2024, + title = {{parSim: Parallel Simulation Studies}}, + author = {Sacha Epskamp}, + year = {2023}, + note = {R package version 0.1.5}, + url = {https://CRAN.R-project.org/package=parSim}, +} + +@Manual{shilane-simitation-2023, + title = {{simitation: Simplified Simulations}}, + author = {David Shilane and Srivastav Budugutta and Mayur Bansal}, + year = {2023}, + note = {R package version 0.0.7}, + url = {https://CRAN.R-project.org/package=simitation}, +} + +@Manual{linner-tidyMC-2022, + title = {{tidyMC: Monte Carlo Simulations Made Easy and Tidy}}, + author = {Stefan Linner and Ignacio {Moreira Lara} and Konstantin Lehmann}, + year = {2022}, + url = {https://github.com/stefanlinner/tidyMC}, +} + +@Article{ucar-simmer-2019, + title = {{simmer}: Discrete-Event Simulation for {R}}, + author = {I{\~n}aki Ucar and Bart Smeets and Arturo Azcorra}, + journal = {Journal of Statistical Software}, + year = {2019}, + volume = {90}, + number = {2}, + pages = {1--30}, + url = {https://doi.org/10.18637/jss.v090.i02}, + doi = {10.18637/jss.v090.i02}, +} + +@Article{fatih-MonteCarloSEM-2021, + title = {{MonteCarloSEM: An R Package to Simulate Data for SEM}}, + author = {Orcan, Fatih}, + journal = {International Journal of Assessment Tools in Education}, + year = {2021}, + volume = {8}, + number = {3}, + pages = {704--713}, + url = {https://dergipark.org.tr/en/download/article-file/1323860}, + doi = {10.21449/ijate.804203}, +} + +@Misc{parsons-simMetric-2022, + title = {{simMetric: Metrics (with Uncertainty) for Simulation Studies that Evaluate Statistical Methods}}, + author = {Rex Parsons}, + publisher = {Queensland University of Technology}, + year = {2022}, + license = {MIT}, + url = {https://doi.org/10.25912/RDF_1665114451679}, + doi = 
{10.25912/RDF_1665114451679}, + note = {R package version 0.1.9000}, +} + +@misc{bien-simulator-2016, + title={The {Simulator}: An {Engine} to {Streamline} {Simulations}}, + author={Jacob Bien}, + year={2016}, + eprint={1607.00021}, + archivePrefix={arXiv}, + primaryClass={stat.CO}, + url={https://doi.org/10.48550/arXiv.1607.00021}, + doi={10.48550/arXiv.1607.00021}, +} + +@Article{couch-infer-2021, + title = {{infer}: An {R} package for tidyverse-friendly statistical inference}, + author = {Simon P. Couch and Andrew P. Bray and Chester Ismay and Evgeni Chasnovski and Benjamin S. Baumer and Mine Çetinkaya-Rundel}, + journal = {Journal of Open Source Software}, + year = {2021}, + volume = {6}, + number = {65}, + pages = {3661}, + url = {https://joss.theoj.org/papers/10.21105/joss.03661}, + doi = {10.21105/joss.03661}, +} + +@article{hofert-simsalapar-2016, + title={{Parallel and Other Simulations in R Made Easy: An End-to-End Study}}, + volume={69}, + url={https://doi.org/10.18637/jss.v069.i04}, + doi={10.18637/jss.v069.i04}, + number={4}, + journal={Journal of Statistical Software}, + author={Hofert, Marius and Mächler, Martin}, + year={2016}, + pages={1–44}, +} + +@misc{elliott-designing-2024, + title={Designing a Data Science simulation with MERITS: A Primer}, + author={Corrine F Elliott and James Duncan and Tiffany M Tang and Merle Behr and Karl Kumbier and Bin Yu}, + year={2024}, + eprint={2403.08971}, + archivePrefix={arXiv}, + primaryClass={stat.CO}, + url={https://arxiv.org/abs/2403.08971}, + doi={10.48550/arXiv.2403.08971}, +} diff --git a/vignettes/paper.md b/vignettes/paper.md new file mode 100644 index 00000000..841e2ceb --- /dev/null +++ b/vignettes/paper.md @@ -0,0 +1,301 @@ +--- +title: '`simChef`: High-quality data science simulations in `R`' +tags: + - simulations + - data science + - R +authors: + - name: James Duncan + orcid: 0000-0003-3297-681X + equal-contrib: true + affiliation: 1 + - name: Tiffany Tang + orcid: 0000-0002-8079-6867 + 
equal-contrib: true + corresponding: true + affiliation: 2 + - name: Corrine F. Elliott + orcid: 0000-0001-7935-9945 + affiliation: 2 + - name: Philippe Boileau + orcid: 0000-0002-4850-2507 + affiliation: 1 + - name: Bin Yu + affiliation: "1, 2, 3, 4" + orcid: 0000-0002-8888-4060 +affiliations: + - name: Graduate Group in Biostatistics, University of California, Berkeley, United States of America + index: 1 + - name: Department of Statistics, University of California, Berkeley, United States of America + index: 2 + - name: Department of Electrical Engineering and Computer Sciences, University of California, Berkeley, United States of America + index: 3 + - name: Center for Computational Biology, University of California, Berkeley, United States of America + index: 4 +date: 28 June 2023 +bibliography: paper.bib + +--- + +# Summary + +`simChef` is an `R` package that empowers data science practitioners to rapidly +plan, carry out, and summarize statistical simulation studies in a flexible, +efficient, and low-code manner. Drawing substantially from the Predictability, +Computability, and Stability (PCS) framework [@yu-veridical-2020], `simChef` +emphasizes the scientific best practices encompassed by PCS by removing many of +the administrative burdens of simulation design through: (1) an intuitive [tidy +grammar](https://design.tidyverse.org/) of data science simulations; (2) +powerful abstractions for distributed simulation processing backed by `future` +[@bengtsson-unifying-2021]; and (3) automated generation of interactive [R +Markdown](https://rmarkdown.rstudio.com/) simulation documentation, situating +results next to the workflows needed to reproduce them. Taken together, +`simChef`'s capabilities overcome many of the design, computational, and +reproducibility hurdles inherent in nearly every data science simulation study. 
+ +# Statement of need + +Data science simulation studies occupy an important role in scientific research +as a means to gain insight into new and existing statistical methods. +Simulations serve as statistical sandboxes that open a path toward otherwise +inaccessible discoveries. For example, they can be used to establish +comprehensive benchmarks of existing procedures for a common task; to +demonstrate the strengths and weaknesses of novel methodology applied to +synthetic and real-world data; or to probe the validity of a theoretical +analysis. + +Creating high-quality simulation studies typically involves a number of +repetitive and error-prone coding tasks: implementing data-generating processes +(DGPs) and statistical methods; sampling from these DGPs; parallelizing +computation of simulation replicates; summarizing metrics; visualizing, +documenting, presenting, and saving results; and so on. While this +administrative overhead is necessary, it is not sufficient for scientific +understanding. Data scientists must navigate a number of important judgment +calls such as the choice of DGPs, baseline statistical methods, associated +parameters, and evaluation metrics for scientific relevancy. + +While the scientific context may vary drastically from one study to the next, +the simulation scaffolding remains largely similar. Yet simulation code +repositories often lack reusability, both for novel settings and when new +questions arise in the original context. `simChef` addresses the need for an +intuitive, extensible, and reusable framework for data science simulations, +allowing data science practitioners to focus their energies on scientific +questions by reducing the burdens of parameterization, parallelization, and +documentation. 
+ +# Core abstractions of data science simulations + +At its core, `simChef` breaks down a simulation experiment into four modular components (\autoref{fig:api}), each implemented as an `R6` class [@chang-r6-2022]: + +- `DGP`: the data-generating processes from which to *generate* data +- `Method`: the methods (or models) to *fit* in the experiment +- `Evaluator`: the evaluation metrics used to *evaluate* the methods' performance +- `Visualizer`: the visualization functions used to *visualize* outputs from the method fits or evaluation results (can be tables, plots, or even `R` Markdown snippets to display) + +![Overview of the four core components in a `simChef` `Experiment`. `simChef` +provides four classes that implement distinct simulation objects in +an intuitive and modular manner: `DGP`, `Method`, `Evaluator`, and `Visualizer`. +Using these classes, users can easily build a `simChef` `Experiment` using reusable, customizable functions +(i.e., `dgp_fun`, `method_fun`, `eval_fun`, and `viz_fun`). +Optional named parameters can be set in these custom functions via the `...` arguments in the `create_*()` methods. +\label{fig:api}](api_overview.png){ width=100% } + +Using these classes, users can create or reuse custom functions (i.e., `dgp_fun`, `method_fun`, `eval_fun`, and `viz_fun` in \autoref{fig:api}) aligned with their scientific goals. +The custom functions then can be parameterized and encapsulated in one of the corresponding classes via a `create_*` method, together with optional named parameters (see \autoref{fig:api}). + +A fifth `R6` class, `Experiment`, unites the four components above and serves as a concrete implementation of the +user's intent to answer a specific scientific question. Specifically, the `Experiment` stores +references to the `DGP`(s), `Method`(s), `Evaluator`(s), and `Visualizer`(s) along with the `DGP` and `Method` +parameters that should be varied and combined during the simulation run. 
+ +![Overview of running a `simChef` `Experiment`. The `Experiment` class handles relationships among the four classes portrayed in \autoref{fig:api}. Experiments may have multiple `DGP`s and `Method`s, which are combined across the Cartesian product of their varying parameters (represented by `\*`). Once computed, each `Evaluator` and `Visualizer` takes in the fitted simulation replicates, while `Visualizer` additionally receives evaluation summaries. +\label{fig:run-exper}](run_experiment.png){ width=100% } + +# A powerful grammar of data science simulations + +Inspired by the tidyverse [@wickham-welcome-2019], `simChef` develops an +intuitive grammar for running simulation studies using the aforementioned `R6` classes. +We provide an illustrative example usage next. + +```r +library(simChef) + +dgp1 <- create_dgp(dgp_fun1, "my_dgp1", sd = 0.5) +dgp2 <- create_dgp(dgp_fun2, "my_dgp2") +method <- create_method(method_fun, "my_method") +eval <- create_evaluator(eval_fun) +viz <- create_visualizer(viz_fun) + +exper <- create_experiment(dgp_list = list(dgp1, dgp2)) %>% + add_method(method) %>% + add_vary_across( + list(dgp1, dgp2), + n = c(1e2, 1e3, 1e4) + ) %>% + add_vary_across( + dgp2, + sparse = c(FALSE, TRUE) + ) %>% + add_vary_across( + method, + scalar_valued_param = c(0.1, 1.0, 10.0), + vector_valued_param = list(c(1, 2, 3), c(4, 5, 6)), + list_valued_param = list(list(a1=1, a2=2, a3=3), + list(b1=3, b2=2, b3=1)) + ) %>% + add_evaluator(eval) %>% + add_viz(viz) + +future::plan(multicore, workers = 64) + +results <- exper %>% + run_experiment(n_reps = 100, save = TRUE) + +new_method <- create_method(new_method_fun, 'my_new_method') + +exper <- exper %>% + add_method(new_method) + +results <- exper %>% + run_experiment(n_reps = 100, use_cached = TRUE) + +init_docs(exper) +render_docs(exper) +``` + +In the example usage, `DGP`(s), `Method`(s), `Evaluator`(s), and `Visualizer`(s) are first created via `create_*()`. 
+These simulation objects can then be combined into an `Experiment` using either `create_experiment()` and/or `add_*()`. + +In an `Experiment`, `DGP`(s) and `Method`(s) can also be varied across one or multiple parameters via `add_vary_across()`. +For instance, in the example `Experiment`, there are two `DGP` instances, both of which are varied across three values of `n` and one of which is additionally varied across two values of `sparse`. +This effectively results in nine distinct configurations for data generation (i.e., 3 variations on `dgp1` + 3x2 variations on `dgp2`). +For the single `Method` in the experiment, we use three values of `scalar_valued_param`, two of `vector_valued_param`, and another two of `list_valued_param`, giving 12 distinct configurations. +Hence, there are a total of 9x12 = 108 DGP-method-parameter combinations in the `Experiment`. + +Thus far, we have simply instantiated an `Experiment` object (akin to creating a recipe for an experiment). +To compute and run the simulation experiment, we next call `run_experiment` with the desired number of replicates. +As summarized in \autoref{fig:run-exper}, running the experiment will +(1) *fit* each `Method` on each `DGP` (and for each of the varying parameter configurations), +(2) *evaluate* the experiment according to the given `Evaluator`(s), and +(3) *visualize* the experiment according to the given `Visualizer`(s). +Furthermore, the number of replicates per combination of `DGP`, `Method`, and parameters specified via `add_vary_across` is determined by the `n_reps` argument to `run_experiment`. +Because replication happens at the per-combination level, the effective total number of replicates in the `Experiment` depends on the number of DGPs, methods, and varied parameters. +In the given example, there are 108 DGP-method-parameter combinations, each of which is replicated 100 times. 
+To reduce the computational burden, the `Experiment` class flexibly handles the computation of simulation replicates in parallel using the `future` package [@bengtsson-unifying-2021]. +\autoref{fig:exper-schematic} provides a detailed schematic of the +`run_experiment` workflow, along with the expected inputs to and outputs from +user-defined functions. + +![Detailed schematic of the `run_experiment` +workflow using `simChef`. Expected inputs to and outputs from user-defined functions are also provided.\label{fig:exper-schematic}](fit_eval_viz.png){ width=100% } + + +# Additional Features + +In addition to the ease of parallelization, `simChef` enables caching of results to further alleviate the computational burden. +Here, users can choose to save the experiment's results to disk by passing `save = TRUE` to `run_experiment`. +Once saved, the user can add new `DGP` and `Method` objects to the experiment and compute additional replicates without re-computing existing results via the `use_cached` option. +Considering the example above, when we add `new_method` and call `run_experiment` with `use_cached = TRUE`, `simChef` finds that the cached results are missing combinations of `new_method`, existing DGPs, and their associated parameters, giving nine new configurations. +Replicates for the new combinations are then appended to the cached results. + +`simChef` also provides users with a convenient API to automatically generate an `R` Markdown document. +This documentation gathers the scientific details, summary tables, and visualizations side-by-side with the user's custom source code and parameters for data-generating processes, statistical methods, evaluation metrics, and plots. +A call to `init_docs` generates empty markdown files for the user to populate with their overarching simulation objectives and with descriptions of each of the `DGP`, `Method`, `Evaluator`, and `Visualizer` objects included in the `Experiment`. 
+Finally, a call to `render_docs` prepares the `R` Markdown document, either for iterative design and analysis of the simulation or to provide a high-quality overview that can be shared easily. +We provide an example of the simulation documentation [here](https://philboileau.github.io/simChef-case-study/results/empirical-fdr-comparison/empirical-fdr-comparison.html). +Corresponding `R` source code is available on [GitHub](https://github.com/PhilBoileau/simChef-case-study). + +# Related `R` packages + +A number of existing `R` packages and projects address needs related to +`simChef`'s functionality. At a higher level of abstraction, the `batchtools` +package [@lang-batchtools-2017] includes concepts for "problems", "algorithms", +and "experiments", similar to `simChef`'s `DGP`, `Method`, and `Experiment` +objects, respectively, but less tailored to the specific needs of data science +simulation experiments. Additionally, `batchtools` provides a number of +utilities for shared-memory and distributed memory computations, including for +interacting with high-performance computing cluster schedulers such as Slurm and +Torque. `simChef` is able to leverage these utilities for distributed +computations via the backends provided by the `future.batchtools` package which +is part of the `future` ecosystem of `R` packages [@bengtsson-unifying-2021]. +Whereas `batchtools` is a general tool for distributed mapping operations, +`simChef` specializes in data science simulations and provides additional +functionality tailored to that setting including its `tidy` grammar of +simulation experiments, the `Evaluator` and `Visualizer` concepts, and automated +documentation capabilities discussed above. 
+ +Like `simChef`, many existing packages specifically aim to simplify the process +of creating simulation experiments by reducing coding burden through helpful +abstractions, distributed computing helpers, and preset methods for generating, +computing, and summarizing simulation replicates. Of particular note are the +following: + +- `SimDesign` [@chalmers-simdesign-2020] focuses on Monte Carlo simulation +experiments and provides a function `runSimulation` that accepts user-defined +`generate`, `analyse`, and `summarise` functions, with support for distributed +computation via the `parallel` base `R` package and `future`. +- `simulator` [@bien-simulator-2016] provides a `tidy` grammar of simulation +experiments and highly modular helpers for evaluating and managing simulation +outputs, relying on the `parallel` package for distributed computation. +- `simpr` [@brown-simpr-2023] defines a `tidy` simulation framework for +generating data, fitting models, varying parameters, and aggregating simulation +results with user-defined and `purrr`-style functions. In addition, it supports +distributed computations backed by the `future` framework. +- `SimEngine` [@kenny-simengine-2024] defines and executes simulation 'levels' +(parameters to vary) and 'scripts' (functions to execute a single simulation +replicate). It manages the definition and execution of simulations and +calculates summary statistics, with support for distributed computations in +coordination with high-performance computing cluster schedulers. + +A third category of related packages are those that share conceptual +similarities with `simChef` in terms of providing helpful abstractions for the design +and analysis of simulation experiments, but at a finer level of detail than +`simChef` intends. For example, the package `DeclareDesign` +[@blair-declaredesign-2019] provides various `declare_*` functions for defining +and evaluating statistical research questions, with an emphasis on the social +sciences. 
The package `infer` [@couch-infer-2021] provides a `tidy` API for +statistical inference, providing the ability to specify random variables and +their relationships, define a null hypothesis, generate data under that +hypothesis, and calculate distributions of statistics based on that hypothesis. +Both of these packages and many of the packages below could be employed in a +user's `DGP`, `Method`, `Evaluator`, or `Visualizer` and deployed via an +`Experiment` to carry out a large-scale simulation with automated documentation +in harmony with `simChef`. + +Finally, many packages provide a small number of well-tailored helper functions +for specific data-generating processes and simulation settings, with or without +distributed computation. In no particular order these include: `simitation` +[@shilane-simitation-2023], `simhelpers` [@joshi-simhelpers-2024], `simTool` +[@scheer-simTool-2020], `parSim` [@epskamp-parSim-2024], `rsimsum` +[@gasparini-rsimsum-2018], `simsalapar` [@hofert-simsalapar-2016], `tidyMC` +[@linner-tidyMC-2022], `MonteCarloSEM` [@fatih-MonteCarloSEM-2021], `simMetric` +[@parsons-simMetric-2022], and `simmer` [@ucar-simmer-2019]. To our knowledge, +no single existing package includes `simChef`'s combination of conceptual +modularity, `tidy` grammar, computational flexibility, simulation workflow +management, and automated documentation. + + +# Discussion + +While `simChef`'s core functionality focuses on computability (C) -- +encompassing efficient usage of computational resources, ease of user +interaction, reproducibility, and documentation -- we emphasize the importance +of predictability (P) and stability (S) in data science simulations (see +@elliott-designing-2024 for an in-depth discussion). The principal goal of +`simChef` is to provide a tool for data scientists to create simulations that +incorporate predictability (through fit to real-world data) and stability +(through sufficient exploration of uncertainty). 
In future +work, we intend to provide tools that can be flexibly tailored to a user's +particular scientific needs and further these goals through automated +predictability and stability summaries and documentation. + +# Acknowledgements + +The authors gratefully acknowledge partial support from (a) the NSF under awards +DMS-2209975, 1613002, 1953191, 2015341, and IIS 1741340; and grant 2023505 +supporting the Foundations of Data Science Institute (FODSI); (b) the Weill +Neurohub; and (c) the Chan Zuckerberg Biohub under an Intercampus Research +Award. TMT acknowledges support from the NSF Graduate Research Fellowship +Program DGE-2146752. + +# References diff --git a/vignettes/paper.pdf b/vignettes/paper.pdf new file mode 100644 index 00000000..53d7dd93 Binary files /dev/null and b/vignettes/paper.pdf differ diff --git a/vignettes/run_experiment.png b/vignettes/run_experiment.png new file mode 100644 index 00000000..bbf0aa60 Binary files /dev/null and b/vignettes/run_experiment.png differ diff --git a/vignettes/simChef-logo.png b/vignettes/simChef-logo.png new file mode 100644 index 00000000..de08a26f Binary files /dev/null and b/vignettes/simChef-logo.png differ