From 7b2d0aecbd45f9e97fd4031a9ac5ed250540b91f Mon Sep 17 00:00:00 2001 From: sou-cheng-choi Date: Wed, 19 Nov 2025 21:25:56 -0600 Subject: [PATCH 01/15] Fix Demo with QMCPy 2.0 --- .gitignore | 4 +++- LDData Demo.ipynb | 16 ++++------------ env.yml | 5 +++-- 3 files changed, 10 insertions(+), 15 deletions(-) diff --git a/.gitignore b/.gitignore index 0eff7ac..5bdb815 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,6 @@ _ags/* *.DS_Store raw.githubusercontent.com/* -*.ipynb-checkpoints \ No newline at end of file +*.ipynb-checkpoints +/sc +.vscode/settings.json diff --git a/LDData Demo.ipynb b/LDData Demo.ipynb index f63d34c..7e24b2f 100644 --- a/LDData Demo.ipynb +++ b/LDData Demo.ipynb @@ -62,8 +62,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "/Users/agsorok/miniconda3/envs/lddata/lib/python3.13/site-packages/qmcpy/discrete_distribution/lattice/lattice.py:257\n", - "\tParameterWarning: Non-randomized lattice sequence includes the origin\n" + "/Users/terrya/miniconda3/envs/lddata/lib/python3.13/site-packages/qmcpy/discrete_distribution/lattice/lattice.py:248: ParameterWarning: Without randomization, the first lattice point is the origin\n", + " warnings.warn(\"Without randomization, the first lattice point is the origin\",ParameterWarning)\n" ] }, { @@ -130,8 +130,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "/Users/agsorok/miniconda3/envs/lddata/lib/python3.13/site-packages/qmcpy/discrete_distribution/digital_net_b2/digital_net_b2.py:389\n", - "\tParameterWarning: Non-randomized DigitalNetB2 sequence includes the origin\n" + "/Users/terrya/miniconda3/envs/lddata/lib/python3.13/site-packages/qmcpy/discrete_distribution/digital_net_b2/digital_net_b2.py:421: ParameterWarning: Without randomization, the first digtial net point is the origin\n", + " warnings.warn(\"Without randomization, the first digtial net point is the origin\",ParameterWarning)\n" ] }, { @@ -302,14 +302,6 @@ "generators = 
[qp.DigitalNetB2(d,randomize=False,generating_matrices=file) for file in files]\n", "plot_extensible_projections(generators,files,n=n)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b8ef5511", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/env.yml b/env.yml index 8cdc4d3..f108ca0 100644 --- a/env.yml +++ b/env.yml @@ -101,8 +101,8 @@ dependencies: - python-json-logger==3.3.0 - pyyaml==6.0.2 - pyzmq==26.3.0 - - qmcpy==1.6.2 - - qmctoolscl==1.1.2 + - qmcpy==2.0 + - qmctoolscl==1.1.5 - referencing==0.36.2 - requests==2.32.3 - rfc3339-validator==0.1.4 @@ -126,3 +126,4 @@ dependencies: - webcolors==24.11.1 - webencodings==0.5.1 - websocket-client==1.8.0 + - huggingface_hub>=0.32.0 From 8adfd7853b883429ff450c2f391cf52bfd3f7f38 Mon Sep 17 00:00:00 2001 From: sou-cheng-choi Date: Thu, 20 Nov 2025 07:51:31 -0600 Subject: [PATCH 02/15] +line breaks --- README.md | 433 +++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 350 insertions(+), 83 deletions(-) diff --git a/README.md b/README.md index e19149d..4d0d6f9 100644 --- a/README.md +++ b/README.md @@ -9,11 +9,24 @@ Low discrepancy generating vectors and matrices. ## General ideas and goals -We propose *standard formats* to specify lattice rules, polynomial lattice rules, and digital nets, in simple text files. We want the formats to be simple and relatively compact, with no more than one line per dimension, -so they can easily be used for point sets in several thousand dimensions if desired. Ordinary text files with decimal numbers are good enough. They are easy to read by both humans and computers in any language. Other specialized formats (Json or Parquet, for example) can be more compact but then -the files are not as easy to read without extra tools. - -Joe and Kuo (2008) provide a file for up to 21201 dimensions for Sobol' points, and this seems to be the most widely used set of parameters for RQMC points at this time. 
They use a fixed and very simple format, which requires no special software to read. We want to provide similar types of files for other types of point sets, for an arbitrarily large number of dimensions. The SSJ simulation library (L’Ecuyer, 2016) can already read some of our proposed formats. Latnet Builder (L’Ecuyer et al., 2022) can (or will) produce output files in these formats. The choice of output format for Latnet Builder can be specified on the command line, using the following keywords: +We propose *standard formats* to specify lattice rules, polynomial lattice +rules, and digital nets, in simple text files. We want the formats to be simple +and relatively compact, with no more than one line per dimension, so they can +easily be used for point sets in several thousand dimensions if desired. +Ordinary text files with decimal numbers are good enough. They are easy to read +by both humans and computers in any language. Other specialized formats (Json or +Parquet, for example) can be more compact but then the files are not as easy to +read without extra tools. + +Joe and Kuo (2008) provide a file for up to 21201 dimensions for Sobol' points, +and this seems to be the most widely used set of parameters for RQMC points at +this time. They use a fixed and very simple format, which requires no special +software to read. We want to provide similar types of files for other types of +point sets, for an arbitrarily large number of dimensions. The SSJ simulation +library (L’Ecuyer, 2016) can already read some of our proposed formats. Latnet +Builder (L’Ecuyer et al., 2022) can (or will) produce output files in these +formats. The choice of output format for Latnet Builder can be specified on the +command line, using the following keywords: - `lattice` A lattice rule: give the modulus and the generating vector. @@ -27,26 +40,67 @@ Sobol' points (default format), give only the direction numbers. Sobol' points, the format used by Joe and Kuo (2008). 
-The most important formats are the first two, since the point sets covered by the other formats are special cases of digital nets, so they can all be described by the `dnet` format. We propose them because they provide alternative representations that are either more compact or commonly used. +The most important formats are the first two, since the point sets covered by +the other formats are special cases of digital nets, so they can all be +described by the `dnet` format. We propose them because they provide alternative +representations that are either more compact or commonly used. All the point sets that we consider have the form $$P_n = \{\boldsymbol{u}_i \in [0,1)^s,\; i=0,\dots,n-1\}$$ -where $n$ is the number of points and $s$ is the number of dimensions. The dimension $j$ goes from 1 to $s$ and there are $n$ points enumerated by $i$ going from 0 to $n-1$. (In computer code, $j$ usually starts at 0, whereas in math papers, it starts at 1; one must be careful about this discrepancy.) The .txt files that contain the parameters have one line per dimension, preceded by a few lines that contain general parameters, such as $s$, $n$, etc. We shall call these lines the *header* of the file. In the header, additional lines that start with `#` can be used for comments and descriptions; these lines are totally optional and should be just skipped by the program that reads the file. Anything that starts with `#` on any given line in the header should also be skipped. All these comments are only for human readers to better see what is in the file, they are not for the computer[^1]. One exception: the first line of the file must be a comment that contains the keyword for the file type; for example `dnet` for a digital net. The number of dimensions (number of lines after the header) can be much larger than what we usually need; it suffices to use the number of rows that are needed. +where $n$ is the number of points and $s$ is the number of dimensions. 
The +dimension $j$ goes from 1 to $s$ and there are $n$ points enumerated by $i$ +going from 0 to $n-1$. (In computer code, $j$ usually starts at 0, whereas in +math papers, it starts at 1; one must be careful about this discrepancy.) The +.txt files that contain the parameters have one line per dimension, preceded by +a few lines that contain general parameters, such as $s$, $n$, etc. We shall +call these lines the *header* of the file. In the header, additional lines that +start with `#` can be used for comments and descriptions; these lines are +totally optional and should be just skipped by the program that reads the file. +Anything that starts with `#` on any given line in the header should also be +skipped. All these comments are only for human readers to better see what is in +the file, they are not for the computer[^1]. One exception: the first line of +the file must be a comment that contains the keyword for the file type; for +example `dnet` for a digital net. The number of dimensions (number of lines +after the header) can be much larger than what we usually need; it suffices to +use the number of rows that are needed. [^1]: Comments are now allowed only in the header lines, not in the $s$ lines that follow. This makes more sense. -The point sets can be extensible in the number of points $n$ or not (they can be constructed for a single $n$ only). Sobol points are extensible ad infinitum, although they are very good only when $n$ is a power of 2. Other types of point sets can also also be extensible, but are usually constructed to be good only for $n = b^k$ for $k$ in a given integer range, e.g., from 10 to 20, where $b \ge 2$ is the base. They satisfy the property that $P_{b^k}$ contains $P_{b^{k-1}}$ for all $k$ in this range. We call them *embedded* point sets. For these types of point sets it is highly recommended to specify the range in a comment in the header of the file [^2]. 
- -[^2]: The range for which the points were built could be given in the file, but this makes things a bit more complicated for some point sets. For example, for ordinary lattice rules with a prime number of points, this additional info might be confusing for some users. For Sobol points, the range has no limit. - -In the proposed formats, the files do not assume a given computer word size (e.g., 32 bits or 64 bits). The format is exactly the same regardless of the word size. Of course, if the file contains integers of more than 32 bits, the corresponding points cannot be generated properly on a 32 bit computer. A comment in the file header can say it. - -Some users might prefer input files with no header at all, only the $s$ lines that give the generating vector or generating matrices. In some languages (e.g., MATLAB), such a file can be read into a matrix by a simple "load file" command, so there is no need to to write any code to read the file. Users who want that can simply strip out the header from the files in standard format and use these naked files privately. We think that the header with human-readable comments as imposed by the standard will be very useful to many users. - -The following sections describe the proposed text-file formats for the different point sets. +The point sets can be extensible in the number of points $n$ or not (they can be +constructed for a single $n$ only). Sobol points are extensible ad infinitum, +although they are very good only when $n$ is a power of 2. Other types of point +sets can also be extensible, but are usually constructed to be good only +for $n = b^k$ for $k$ in a given integer range, e.g., from 10 to 20, where $b +\ge 2$ is the base. They satisfy the property that $P_{b^k}$ contains +$P_{b^{k-1}}$ for all $k$ in this range. We call them *embedded* point sets. For +these types of point sets it is highly recommended to specify the range in a +comment in the header of the file [^2]. 
+ +[^2]: The range for which the points were built could be given in the file, but + this makes things a bit more complicated for some point sets. For example, + for ordinary lattice rules with a prime number of points, this additional + info might be confusing for some users. For Sobol points, the range has no + limit. + +In the proposed formats, the files do not assume a given computer word size +(e.g., 32 bits or 64 bits). The format is exactly the same regardless of the +word size. Of course, if the file contains integers of more than 32 bits, the +corresponding points cannot be generated properly on a 32 bit computer. A +comment in the file header can say it. + +Some users might prefer input files with no header at all, only the $s$ lines +that give the generating vector or generating matrices. In some languages +(e.g., MATLAB), such a file can be read into a matrix by a simple "load file" +command, so there is no need to write any code to read the file. Users who +want that can simply strip out the header from the files in standard format and +use these naked files privately. We think that the header with human-readable +comments as imposed by the standard will be very useful to many users. + +The following sections describe the proposed text-file formats for the different +point sets. ## Types of point sets and notation @@ -58,14 +112,31 @@ For an ordinary *lattice rule of rank 1*, we have $$P_n = \{\boldsymbol{u}_i = (i \boldsymbol{a} \bmod n)/n,\; i=0,\dots,n-1\}$$ -where $\boldsymbol{a} = (a_1,\dots,a_s)$ is the generating vector. We must specify $s$, $n$, and $\boldsymbol{a}$. - -In a `lattice` file, the first line must start with `# lattice`. After that, not counting the comment lines, the first line gives the number $s$ of dimensions, the second line gives the number $n$ of points, and lines 3 to $s+2$ give the coefficients $a_1,\dots, a_s$, one value per line. 
In the case of embedded lattice rules, $n$ would usually be a power of 2, say $n=2^k$, and the smaller embedded lattices will contain the first $2^{k-1}$ points, the first $2^{k-2}$ points, etc. -Additional comments in the file should tell when the lattice is embedded, which figure of merit and what weights were used, the construction method, etc. [^3] - -[^3]: Should we be forced to always put the embedding range in the file for the computer to read? My suggestion is not to force the computer to read it, but just put it as a comment for humans, Otherwise, it will force an additional line that gives the base and the range. What would we put for example if $n$ is prime and the rule is not embedded? Should we have a `lattice2` format for embedded rules in base 2? Putting more options makes things more complicated. - -One example of a parameter file for an ordinary lattice rule, in `lattice` format is given below. In this file, the first line is skipped, only the number `8` is read on the second line, only the number `65536` is read on the second line, etc. +where $\boldsymbol{a} = (a_1,\dots,a_s)$ is the generating vector. We must +specify $s$, $n$, and $\boldsymbol{a}$. + +In a `lattice` file, the first line must start with `# lattice`. After that, not +counting the comment lines, the first line gives the number $s$ of dimensions, +the second line gives the number $n$ of points, and lines 3 to $s+2$ give the +coefficients $a_1,\dots, a_s$, one value per line. In the case of embedded +lattice rules, $n$ would usually be a power of 2, say $n=2^k$, and the smaller +embedded lattices will contain the first $2^{k-1}$ points, the first $2^{k-2}$ +points, etc. Additional comments in the file should tell when the lattice is +embedded, which figure of merit and what weights were used, the construction +method, etc. [^3] + +[^3]: Should we be forced to always put the embedding range in the file for the + computer to read? 
My suggestion is not to force the computer to read it, + but just put it as a comment for humans. Otherwise, it will force an + additional line that gives the base and the range. What would we put for + example if $n$ is prime and the rule is not embedded? Should we have a + `lattice2` format for embedded rules in base 2? Putting more options makes + things more complicated. + +One example of a parameter file for an ordinary lattice rule, in `lattice` +format is given below. In this file, the first line is skipped, only the number +`8` is read on the second line, only the number `65536` is read on the third +line, etc. ``` # lattice @@ -85,7 +156,12 @@ One example of a parameter file for an ordinary lattice rule, in `lattice` forma ### Parameters for digital nets: `dnet` -A *digital net in base* $b$ with $n=b^k$ points is defined by selecting integers $s \geq 1$, $r \geq k \geq 1$, and $s$ matrices $\boldsymbol{C}_1,\dots,\boldsymbol{C}_s$ of size $r \times k$ with entries in $\mathbb{Z}_b$, called the generating matrices. For $i=0,\dots,n-1$, let $i = \sum_{\ell=0}^{k-1} a_{i,\ell} b^\ell$ be the expansion of $i$ in base $b$, and for $j=1,\dots s$, let +A *digital net in base* $b$ with $n=b^k$ points is defined by selecting integers +$s \geq 1$, $r \geq k \geq 1$, and $s$ matrices +$\boldsymbol{C}_1,\dots,\boldsymbol{C}_s$ of size $r \times k$ with entries in +$\mathbb{Z}_b$, called the generating matrices. For $i=0,\dots,n-1$, let $i = +\sum_{\ell=0}^{k-1} a_{i,\ell} b^\ell$ be the expansion of $i$ in base $b$, and +for $j=1,\dots,s$, let $$(y_{i, j, 1}, \dots, y_{i, j, r})^T = \boldsymbol{C}_j \cdot (a_{i, 0}, \dots, a_{i, k-1})^T$$ @@ -93,12 +169,34 @@ and $$u_{i, j} = \sum_{\ell=1}^{r} y_{i, j, \ell} b^{-\ell}.$$ -The points $\boldsymbol{u}_i$ are defined by $\boldsymbol{u}_i = (u_{i,1},\dots,u_{i,s})$. Digital nets are usually in base $b=2$, but we allow a general (typically prime) base $b \ge 2$. - -The proposed format to specify digital nets is as follows. 
The first line must start with `# dnet`. Then the first four non-comment lines give $b$ (the base), $s$ (the number of dimensions), $k$ (the number of columns), and $r$ (the number of rows in the generating matrices in base $b$). Thus, the output values will have "precision" $b^{-r}$ (they will be integer multiples of $b^{-r}$). For $b=2$, a common value in the past has been $r=31$ when using 32 bit integers, but going forward we should use 64 bit integers and $r=63$ or 64, or perhaps $r=53$ to exploit the full accuracy of a `double`. By looking at $r$, one can see right away whether this file is good for 64 bit computers only or for 32 bit computers as well. - -The $s$ lines after this header will contain the $s$ generating matrices, one per line. Each of these lines contains $k$ integers smaller than $b^r$ giving the $k$ columns of $\boldsymbol{C}_j$, using by default the same encoding as in the class `DigitalNetBase2` in SSJ for $b=2$. That is, the base $b$ representation of the integer gives the $r$ digits in the corresponding column, with the digit on the first row of the matrix (row 0) being the most significant, and the one on the last row (row $r-1$) being the least significant. For example, if $b=2$, $r=31$, and the first column has a 1 in the first row and 0 in all other rows, -as is always the case for Sobol points, then the integer representation of this column will be $2^{30} = 1\,073\,741\,824$. If there is a 1 in the last row and 0 elsewhere, the representation will be $2^0 = 1$. If all 31 elements of the column are 1, the representation will be $2^{31}-1$. +The points $\boldsymbol{u}_i$ are defined by $\boldsymbol{u}_i = +(u_{i,1},\dots,u_{i,s})$. Digital nets are usually in base $b=2$, but we allow a +general (typically prime) base $b \ge 2$. + +The proposed format to specify digital nets is as follows. The first line must +start with `# dnet`. 
Then the first four non-comment lines give $b$ (the base), +$s$ (the number of dimensions), $k$ (the number of columns), and $r$ (the number +of rows in the generating matrices in base $b$). Thus, the output values will +have "precision" $b^{-r}$ (they will be integer multiples of $b^{-r}$). For +$b=2$, a common value in the past has been $r=31$ when using 32 bit integers, +but going forward we should use 64 bit integers and $r=63$ or 64, or perhaps +$r=53$ to exploit the full accuracy of a `double`. By looking at $r$, one can +see right away whether this file is good for 64 bit computers only or for 32 bit +computers as well. + +The $s$ lines after this header will contain the $s$ generating matrices, one +per line. Each of these lines contains $k$ integers smaller than $b^r$ giving +the $k$ columns of $\boldsymbol{C}_j$, using by default the same encoding as in +the class `DigitalNetBase2` in SSJ for $b=2$. That is, the base $b$ +representation of the integer gives the $r$ digits in the corresponding column, +with the digit on the first row of the matrix (row 0) being the most +significant, and the one on the last row (row $r-1$) being the least +significant. For example, if $b=2$, $r=31$, and the first column has a 1 in the +first row and 0 in all other rows, as is always the case for Sobol points, then +the integer representation of this column will be $2^{30} = 1\,073\,741\,824$. +If there is a 1 in the last row and 0 elsewhere, the representation will be $2^0 += 1$. If all 31 elements of the column are 1, the representation will be +$2^{31}-1$. One example of a file for a digital net in `dnet` format: @@ -115,9 +213,21 @@ One example of a file for a digital net in `dnet` format: ... ``` -This differs from Joe and Kuo (2008), where the $c$-th column (for $c\ge 1$) is represented by an integer smaller than $2^c$ (in base 2) and the least significant bit is the one on the diagonal. 
Their representation works when $\boldsymbol{C}_j$ is upper triangular, which is true for Sobol point sets, but not for digital nets in general. - -Recall that coordinate $j$ of the $i$-th point is obtained by multiplying the base $b$ matrix $\boldsymbol{C}_j$ by the vector of digits of the representation of $i$ in base $b$, with the least significant digits of $i$ at the top. That is, the least significant digit of $i$ goes with the first column of $\boldsymbol{C}_j$. And the first row of $\boldsymbol{C}_j$ is for the most significant digit of output. With our representation of $\boldsymbol{C}_j$ by $k$ integers, the points are easy and fast to generate in base 2. We obtain `u[i,j]`, coordinate $j$ of point $i$, with the following code snippet, in which `C[j,c]` is the integer that represents column $c$ of $\boldsymbol{C}_j$: +This differs from Joe and Kuo (2008), where the $c$-th column (for $c\ge 1$) is +represented by an integer smaller than $2^c$ (in base 2) and the least +significant bit is the one on the diagonal. Their representation works when +$\boldsymbol{C}_j$ is upper triangular, which is true for Sobol point sets, but +not for digital nets in general. + +Recall that coordinate $j$ of the $i$-th point is obtained by multiplying the +base $b$ matrix $\boldsymbol{C}_j$ by the vector of digits of the representation +of $i$ in base $b$, with the least significant digits of $i$ at the top. That +is, the least significant digit of $i$ goes with the first column of +$\boldsymbol{C}_j$. And the first row of $\boldsymbol{C}_j$ is for the most +significant digit of output. With our representation of $\boldsymbol{C}_j$ by +$k$ integers, the points are easy and fast to generate in base 2. 
We obtain +`u[i,j]`, coordinate $j$ of point $i$, with the following code snippet, in which +`C[j,c]` is the integer that represents column $c$ of $\boldsymbol{C}_j$: ```python normFactor = 1.0 / (1 << r) # 2^(-r) @@ -130,19 +240,45 @@ for c in range(k): ### Parameters for polynomial lattice rules: `plattice` -*Polynomial lattice rules* are a special type of digital nets with generating matrices of a special form. For a polynomial lattice rule of rank 1 in a prime base $b$, we have +*Polynomial lattice rules* are a special type of digital nets with generating +matrices of a special form. For a polynomial lattice rule of rank 1 in a prime +base $b$, we have -$$P_n = \left\{\left(\varphi\left(\frac{h(z) a_1(z)}{Q(z)}\right), \dots, \varphi\left(\frac{h(z) a_s(z)}{Q(z)}\right)\right) : h(z)\in\mathbb{F}_b[z], \ \text{degree}(h(z)) < k\right\}.$$ +$$P_n = \left\{\left(\varphi\left(\frac{h(z) a_1(z)}{Q(z)}\right), \dots, +\varphi\left(\frac{h(z) a_s(z)}{Q(z)}\right)\right) : h(z)\in\mathbb{F}_b[z], \ +\text{degree}(h(z)) < k\right\}.$$ -where $\mathbb{F}_b[z]$ is the space of polynomials with coefficients in $\mathbb{F}_b$, the *modulus* $Q(z) \in \mathbb{F}_b[z]$ is a polynomial of degree $k$, the *generating vector* $\boldsymbol{a}(z) = (a_1(z),\dots,a_s(z))\in \mathbb{F}_b[z]^s$ is a vector of $s$ polynomials of degrees less than $k$, and the mapping $\varphi$ is defined by +where $\mathbb{F}_b[z]$ is the space of polynomials with coefficients in +$\mathbb{F}_b$, the *modulus* $Q(z) \in \mathbb{F}_b[z]$ is a polynomial of +degree $k$, the *generating vector* $\boldsymbol{a}(z) = +(a_1(z),\dots,a_s(z))\in \mathbb{F}_b[z]^s$ is a vector of $s$ polynomials of +degrees less than $k$, and the mapping $\varphi$ is defined by -$$\varphi\left(\sum_{l = w}^{\infty} x_l z^{-l}\right) = \sum_{l = \max(w, 1)}^{\infty} x_l b^{-l}.$$ +$$\varphi\left(\sum_{l = w}^{\infty} x_l z^{-l}\right) = \sum_{l = \max(w, +1)}^{\infty} x_l b^{-l}.$$ This point set has $n = b^k$ points. 
-We must specify the polynomial modulus $Q(z)$ and the polynomial generating vector $\boldsymbol{a}(z)$. The polynomial modulus will be represented as an integer that has $(k+1)$ digits in base $b$, and all the other polynomials will be represented as integers that have no more than $k$ digits in base $b$. All these integers will be given in base 10 in the file, one per line. In practice, we usually have $b=2$, so $k$ represents the number of bits. The integer that represents a polynomial is obtained simply by replacing the formal variable by $b$. For example, if the polynomial is $Q(z) = z^4 + z^3 + 1$ and $b=2$, its coefficients are "1 1 0 0 1" and its integer representation is $2^4 + 2^3 + 1 = 25$. This is the usual representation, as used in Goda and Dick (2015), for example. In the case of embedded point sets, the modulus should be $Q(z) = z^k$ for $n=b^k$ points, and its integer representation is $b^k$. In particular, $Q(z) = z$ is represented by the integer $b$. - -As usual, the first line is a comment that tells the type of file. Then the first four non-comment lines give the base $b$, the number $s$ of dimensions, the degree $k$ of the polynomial modulus, and the integer representation of this polynomial. Lines 5 to $s+4$ give the polynomials that form the generating vector, one per line, using the integer representation just explained. One example of a file for a polynomial lattice in the `plattice` format: +We must specify the polynomial modulus $Q(z)$ and the polynomial generating +vector $\boldsymbol{a}(z)$. The polynomial modulus will be represented as an +integer that has $(k+1)$ digits in base $b$, and all the other polynomials will +be represented as integers that have no more than $k$ digits in base $b$. All +these integers will be given in base 10 in the file, one per line. In practice, +we usually have $b=2$, so $k$ represents the number of bits. 
The integer that +represents a polynomial is obtained simply by replacing the formal variable by +$b$. For example, if the polynomial is $Q(z) = z^4 + z^3 + 1$ and $b=2$, its +coefficients are "1 1 0 0 1" and its integer representation is $2^4 + 2^3 + 1 = +25$. This is the usual representation, as used in Goda and Dick (2015), for +example. In the case of embedded point sets, the modulus should be $Q(z) = z^k$ +for $n=b^k$ points, and its integer representation is $b^k$. In particular, +$Q(z) = z$ is represented by the integer $b$. + +As usual, the first line is a comment that tells the type of file. Then the +first four non-comment lines give the base $b$, the number $s$ of dimensions, +the degree $k$ of the polynomial modulus, and the integer representation of this +polynomial. Lines 5 to $s+4$ give the polynomials that form the generating +vector, one per line, using the integer representation just explained. One +example of a file for a polynomial lattice in the `plattice` format: ``` # plattice @@ -162,23 +298,66 @@ As usual, the first line is a comment that tells the type of file. Then the firs 17213 ``` -A polynomial lattice rule in base $b$ can also be represented as a digital net in base $b$, so its parameters can also be provided in a file in the `dnet` format, as for general digital net in base $b$. But the generating matrices have a special form and the above representation is much more compact (a single integer per row instead of $k$ integers per row). On the other hand, generating the points is faster with the generating matrices than with the polynomial representation, so the software that will use the `plattice` files and generate the points would usually first convert the polynomials into the corresponding generating matrices. LatNet Builder (L’Ecuyer et al., 2022) is also able to make the conversion and produce a file in the `dnet` format, for more convenience and better flexibility, so the user can select the format she/he prefers. 
+A polynomial lattice rule in base $b$ can also be represented as a digital net +in base $b$, so its parameters can also be provided in a file in the `dnet` +format, as for general digital net in base $b$. But the generating matrices have +a special form and the above representation is much more compact (a single +integer per row instead of $k$ integers per row). On the other hand, generating +the points is faster with the generating matrices than with the polynomial +representation, so the software that will use the `plattice` files and generate +the points would usually first convert the polynomials into the corresponding +generating matrices. LatNet Builder (L’Ecuyer et al., 2022) is also able to make +the conversion and produce a file in the `dnet` format, for more convenience and +better flexibility, so the user can select the format she/he prefers. ### Parameters for Sobol nets: `sobol` and `soboljk` -The Sobol' construction provides another special case of digital nets (and sequences), in base 2. They are defined in many places, including Joe and Kuo (2008). For each coordinate $j$, we select a primitive polynomial $p_j(z)$ of degree $c_j$, and $c_j$ integers $m_{j,1},\dots,m_{j,c_j}$ which are used to define the generating matrix $\boldsymbol{C}_j$. The real numbers $2^{-c} m_{j,c}$ are called the -initial *direction numbers*. More details are given in Joe and Kuo (2008) and [here](http://umontreal-simul.github.io/ssj/docs/master/classumontreal_1_1ssj_1_1hups_1_1SobolSequence.html). - -One obvious option for these point sets is to adopt exactly the same format as Joe and Kuo (2008), because it is already used in many places. The only difference is that we now allow comment lines in the file. In the format of Joe and Kuo (2008), only the first line is skipped. In the proposed format, other comment lines can be added at the beginning of the file, e.g., to give the maximum number of dimensions in the file, the criterion and weights that were used, etc. 
-Note that Sobol' sequences have an infinite number of points and an unlimited number of dimensions, although the file will give parameters for a finite number of dimensions. - -The other lines of the file specify the primitive polynomials and the initial direction numbers for each dimension $j \ge 2$, one line per dimension. For dimension $j=1$, the generating matrix is the identity and is not given in the file (it is implicit). The columns of this matrix are not obtained via a recurrence based on a primitive polynomial, -so this matrix is handled separately. - -The first number on each line is the dimension $j$. -The second number is the degree $c_j$ of the primitive polynomial $p_j(x)$ used for this dimension. The third number is the integer that corresponds to the binary representation of the inner coefficients of this polynomial (we ignore the first and last coefficients, they are always 1). For example, if the polynomial is $p_j(x) = x^4 + x^3 + 1$, the coefficients are "1 1 0 0 1", and after removing the first and last "1", we get 100 in base 2, which is 4, so the third column would contain the number 4. (Without removing the first and last "1", the number would be 25 instead.) After these three numbers, there are $c_j$ integers $m_{j,1},\dots,m_{j,c_j}$ where $m_{j,c}$ is the $c^\text{th}$ (real-valued) initial direction number for this coordinate, multiplied by $2^c$ to obtain an integer. This $m_{j,c}$ is the integer formed by taking the bits in row 1 to row $c$ of column $c$, in this order. The last bit is the bit on the diagonal, which is always 1, so all $m_{j,c}$'s are odd integers. I think this format comes from Bratley and Fox (1988). - -We denote this format for Sobol parameters by the `soboljk` keyword. One example of a file in this format is shown below. The first line gives the type of file and the next three lines are comments that must be skipped by the reading program. 
+The Sobol' construction provides another special case of digital nets (and +sequences), in base 2. They are defined in many places, including Joe and Kuo +(2008). For each coordinate $j$, we select a primitive polynomial $p_j(z)$ of +degree $c_j$, and $c_j$ integers $m_{j,1},\dots,m_{j,c_j}$ which are used to +define the generating matrix $\boldsymbol{C}_j$. The real numbers $2^{-c} +m_{j,c}$ are called the initial *direction numbers*. More details are given in +Joe and Kuo (2008) and +[here](http://umontreal-simul.github.io/ssj/docs/master/classumontreal_1_1ssj_1_1hups_1_1SobolSequence.html). + +One obvious option for these point sets is to adopt exactly the same format as +Joe and Kuo (2008), because it is already used in many places. The only +difference is that we now allow comment lines in the file. In the format of Joe +and Kuo (2008), only the first line is skipped. In the proposed format, other +comment lines can be added at the beginning of the file, e.g., to give the +maximum number of dimensions in the file, the criterion and weights that were +used, etc. Note that Sobol' sequences have an infinite number of points and an +unlimited number of dimensions, although the file will give parameters for a +finite number of dimensions. + +The other lines of the file specify the primitive polynomials and the initial +direction numbers for each dimension $j \ge 2$, one line per dimension. For +dimension $j=1$, the generating matrix is the identity and is not given in the +file (it is implicit). The columns of this matrix are not obtained via a +recurrence based on a primitive polynomial, so this matrix is handled +separately. + +The first number on each line is the dimension $j$. The second number is the +degree $c_j$ of the primitive polynomial $p_j(x)$ used for this dimension. 
The +third number is the integer that corresponds to the binary representation of the +inner coefficients of this polynomial (we ignore the first and last +coefficients, they are always 1). For example, if the polynomial is $p_j(x) = +x^4 + x^3 + 1$, the coefficients are "1 1 0 0 1", and after removing the first +and last "1", we get 100 in base 2, which is 4, so the third column would +contain the number 4. (Without removing the first and last "1", the number would +be 25 instead.) After these three numbers, there are $c_j$ integers +$m_{j,1},\dots,m_{j,c_j}$ where $m_{j,c}$ is the $c^\text{th}$ (real-valued) +initial direction number for this coordinate, multiplied by $2^c$ to obtain an +integer. This $m_{j,c}$ is the integer formed by taking the bits in row 1 to row +$c$ of column $c$, in this order. The last bit is the bit on the diagonal, which +is always 1, so all $m_{j,c}$'s are odd integers. I think this format comes from +Bratley and Fox (1988). + +We denote this format for Sobol parameters by the `soboljk` keyword. One example +of a file in this format is shown below. The first line gives the type of file +and the next three lines are comments that must be skipped by the reading +program. ``` # soboljk @@ -194,7 +373,17 @@ We denote this format for Sobol parameters by the `soboljk` keyword. One example 8 5 2 1 1 5 5 17 ``` -The `soboljk` format can be simplified as follows. First, removing the first and last "1" in the representation of the primitive polynomials saves a bit of memory, but it also makes thinks slightly more complicated. In the default representations of the primitive polynomials in the code that generates the points, these bits are usually not removed. In SSJ, the first thing we do when reading a file in `soboljk` format is to add them back. 
Also, the primitive polynomials can be in a separate file, since they never change, and only the (initial) direction numbers (those depend on the selected FOM and weights) would be given to specify the Sobol' points. That is, we remove the first three columns of the `soboljk` format. The Magic Point Shop (Nuyens, 2020) also produces files that contain only the direction numbers. +The `soboljk` format can be simplified as follows. First, removing the first and +last "1" in the representation of the primitive polynomials saves a bit of +memory, but it also makes things slightly more complicated. In the default +representations of the primitive polynomials in the code that generates the +points, these bits are usually not removed. In SSJ, the first thing we do when +reading a file in `soboljk` format is to add them back. Also, the primitive +polynomials can be in a separate file, since they never change, and only the +(initial) direction numbers (those depend on the selected FOM and weights) would +be given to specify the Sobol' points. That is, we remove the first three +columns of the `soboljk` format. The Magic Point Shop (Nuyens, 2020) also +produces files that contain only the direction numbers. One example of a file in this `sobol` format: @@ -211,11 +400,29 @@ One example of a file in this `sobol` format: 1 1 5 5 17 ``` -A list of the first few primitive polynomials in base 2 is given [here](https://mathworld.wolfram.com/PrimitivePolynomial.html). If we *do not* remove the first and last 1's in their representations, the first primitive polynomials are: `3, 7, 11, 13, 19, 25, ...`. Their degrees are `1, 2, 3, 3, 4, 4, ...`. This representation is the one used in the code of SSJ, for example. We can have a separate file that gives these polynomials, one per line, exactly as in the first three columns of the `soboljk` format. We may also want to remove the first column. 
- -Another, perhaps more convenient, way of storing Sobol' constructions is to just use the general `dnet` format, in which the generating matrices are given explicitly. This `dnet` format is easier to use. On the other hand, it requires specifying a (maximum) value of $k$, and $k$ integers per row to specify the generating matrices, which leads to larger files. From a file in `sobol` format, one can construct a digital net with an arbitrarily large $k$. - -When $n = 2^k$ is fixed, so we use exactly $n = 2^k$ points and there is there no embedding, we can add one *extra dimension* at the beginning by using the reflected identity as a generating matrix. The successive values for this coordinate will then be $0, 1/n, 2/n, 3/n, \dots$ in this order. This matrix will not be given in the file for Sobol' points; the QMC/RQMC software must handle it. For lattice rules and general digital nets with fixed $n$ (non-embedded), the file could give a first coordinate with this behavior. +A list of the first few primitive polynomials in base 2 is given +[here](https://mathworld.wolfram.com/PrimitivePolynomial.html). If we *do not* +remove the first and last 1's in their representations, the first primitive +polynomials are: `3, 7, 11, 13, 19, 25, ...`. Their degrees are `1, 2, 3, 3, 4, +4, ...`. This representation is the one used in the code of SSJ, for example. We +can have a separate file that gives these polynomials, one per line, exactly as +in the first three columns of the `soboljk` format. We may also want to remove +the first column. + +Another, perhaps more convenient, way of storing Sobol' constructions is to just +use the general `dnet` format, in which the generating matrices are given +explicitly. This `dnet` format is easier to use. On the other hand, it requires +specifying a (maximum) value of $k$, and $k$ integers per row to specify the +generating matrices, which leads to larger files. 
From a file in `sobol` format, +one can construct a digital net with an arbitrarily large $k$. + +When $n = 2^k$ is fixed, so we use exactly $n = 2^k$ points and there is +no embedding, we can add one *extra dimension* at the beginning by using the +reflected identity as a generating matrix. The successive values for this +coordinate will then be $0, 1/n, 2/n, 3/n, \dots$ in this order. This matrix +will not be given in the file for Sobol' points; the QMC/RQMC software must +handle it. For lattice rules and general digital nets with fixed $n$ +(non-embedded), the file could give a first coordinate with this behavior. ## Files that contain randomizations @@ -234,7 +441,8 @@ A nested uniform scramble in base $b$. A (linear) left matrix scramble in base $b$. -For a `shiftmod1` in $s$ dimensions, the file will contain $s$ in the first line, followed by $s$ real numbers between 0 and 1, one per line. +For a `shiftmod1` in $s$ dimensions, the file will contain $s$ in the first +line, followed by $s$ real numbers between 0 and 1, one per line. ``` # shiftmod1 @@ -245,7 +453,14 @@ For a `shiftmod1` in $s$ dimensions, the file will contain $s$ in the first line 0.1530364040t106301 ``` -For a `dshift` with $r$ digits of accuracy in base $b$, in $s$ dimensions, the file will contain $b$ in the first line, $s$ in the second line, $r$ in the third line, and then $s$ integers from 0 to $b^r-1$, one per line. For the latter, the digits of the base $b$ representation of the integer divided by $b^r$ will be added modulo $b$ to the corresponding digits of the base $b$ representation of the coordinate. For example, if $b=2$ and $r=31$, the randomization makes a xor of the 31 bits of this integer with the 31 most significant bits of the corresponding coordinate of each point. 
+For a `dshift` with $r$ digits of accuracy in base $b$, in $s$ dimensions, the +file will contain $b$ in the first line, $s$ in the second line, $r$ in the +third line, and then $s$ integers from 0 to $b^r-1$, one per line. For the +latter, the digits of the base $b$ representation of the integer divided by +$b^r$ will be added modulo $b$ to the corresponding digits of the base $b$ +representation of the coordinate. For example, if $b=2$ and $r=31$, the +randomization makes a xor of the 31 bits of this integer with the 31 most +significant bits of the corresponding coordinate of each point. ``` # dshift @@ -258,9 +473,18 @@ For a `dshift` with $r$ digits of accuracy in base $b$, in $s$ dimensions, the f 963462828 ``` -For a `lmscramble` with $r$ digits of accuracy, for $b^k$ points in base $b$ in $s$ dimensions, we need to store $s$ lower-triangular invertible $r\times r$ matrices with entries in $\{0,\dots,b-1\}$. For $b=2$, each matrix must have only 1's on the diagonal and 0's above the diagonal. Each such matrix can be stored in one line of the file, in exactly the same format as the generating matrices in the `dnet` format, using one integer for each column. We want them in this format for the fast LMS implementation we have in SSJ, for example. The file will contain $b$ in the first non-comment line, $s$ in the second line, $r$ in the third line, -and then $s$ square lower-triangular and invertible $r\times r$ matrices, one per line, with each column represented as an integer as in the `dnet` format. Thus, each scrambling matrix is represented by $r$ integers on the same line. -Here is an example, +For a `lmscramble` with $r$ digits of accuracy, for $b^k$ points in base $b$ in +$s$ dimensions, we need to store $s$ lower-triangular invertible $r\times r$ +matrices with entries in $\{0,\dots,b-1\}$. For $b=2$, each matrix must have +only 1's on the diagonal and 0's above the diagonal. 
Each such matrix can be +stored in one line of the file, in exactly the same format as the generating +matrices in the `dnet` format, using one integer for each column. We want them +in this format for the fast LMS implementation we have in SSJ, for example. The +file will contain $b$ in the first non-comment line, $s$ in the second line, $r$ +in the third line, and then $s$ square lower-triangular and invertible $r\times +r$ matrices, one per line, with each column represented as an integer as in the +`dnet` format. Thus, each scrambling matrix is represented by $r$ integers on +the same line. Here is an example, ``` # lmscramble @@ -274,11 +498,32 @@ Here is an example, ... ``` -For a `nuscramble` of the first $r \ge k$ digits, for $n=b^k$ points in base $b$ in $s$ dimensions, with the implementation proposed in Section 3 of Friedel and Keller (2002) and used for $b=2$ in class `DigitalNetBase2` of SSJ, we need $sn$ blocks of $r$ random digits in base $b$. Each such block can be represented as an integer in the range $\{0,1,\dots,b^r-1\}$. For $b=2$, these are $r$ bit integers. We can store these integers one row per dimension, $n$ integers per row. This gives the following `nuscramble` file format. The first non-comment line contains the base $b$, the second line gives the number $s$ of dimensions, the third line gives the scramble resolution (the number of digits that are scrambled), and the following $s$ lines give the $sn$ integers used for the scrambling, $n$ integers per line. Note that this is the same amount of random numbers that we would need if we use plain Monte Carlo instead of RQMC[^4][^5]. - -[^4]: Another way of storing the NUS is as follows. For each coordinate $j$, each point can be identified by a $k$ bit integer, and the NUS maps each such $k$ to a $r$ bit integer that corresponds to the scrambled coordinate $j$ of this point. So we can simply store this map in an array of size $b^k$ whose entry $i$ contains the corresponding $r$ bit integer. 
Applying this NUS is then fast and straightforward. - -[^5]: Alternative implementations of NUS that use a hashing function in place of a RNG are proposed in Burley (2020) and Laine and Karras (2011). These methods might be faster and the is much less information to store to reproduce a given scramble, but the hashing function must be fixed, known, and reliable. This essentially amount to fixing the RNG and storing only its seed. +For a `nuscramble` of the first $r \ge k$ digits, for $n=b^k$ points in base $b$ +in $s$ dimensions, with the implementation proposed in Section 3 of Friedel and +Keller (2002) and used for $b=2$ in class `DigitalNetBase2` of SSJ, we need $sn$ +blocks of $r$ random digits in base $b$. Each such block can be represented as +an integer in the range $\{0,1,\dots,b^r-1\}$. For $b=2$, these are $r$ bit +integers. We can store these integers one row per dimension, $n$ integers per +row. This gives the following `nuscramble` file format. The first non-comment +line contains the base $b$, the second line gives the number $s$ of dimensions, +the third line gives the scramble resolution (the number of digits that are +scrambled), and the following $s$ lines give the $sn$ integers used for the +scrambling, $n$ integers per line. Note that this is the same amount of random +numbers that we would need if we use plain Monte Carlo instead of RQMC[^4][^5]. + +[^4]: Another way of storing the NUS is as follows. For each coordinate $j$, + each point can be identified by a $k$ bit integer, and the NUS maps each + such $k$ to a $r$ bit integer that corresponds to the scrambled coordinate + $j$ of this point. So we can simply store this map in an array of size $b^k$ + whose entry $i$ contains the corresponding $r$ bit integer. Applying this + NUS is then fast and straightforward. + +[^5]: Alternative implementations of NUS that use a hashing function in place of + a RNG are proposed in Burley (2020) and Laine and Karras (2011). 
These + methods might be faster and there is much less information to store to + reproduce a given scramble, but the hashing function must be fixed, known, + and reliable. This essentially amounts to fixing the RNG and storing only its + seed. ``` # nuscramble @@ -295,21 +540,43 @@ For a `nuscramble` of the first $r \ge k$ digits, for $n=b^k$ points in base $b$ ## File names and other recommendations -It is strongly recommend that all file names start with the corresponding keyword, like `plattice` for a polynomial lattice rule, `sobol` for a Sobol point set, and `lmscramble` for a left matrix scramble, for example. +It is strongly recommended that all file names start with the corresponding +keyword, like `plattice` for a polynomial lattice rule, `sobol` for a Sobol +point set, and `lmscramble` for a left matrix scramble, for example. -It is also recommended to put enough relevant comments in each file for a knowledgeable human to find what the file is for (type of point set, figure of merit and weights that were used to construct it, range of values of $n$ for embedded point sets, etc.). +It is also recommended to put enough relevant comments in each file for a +knowledgeable human to find what the file is for (type of point set, figure of +merit and weights that were used to construct it, range of values of $n$ for +embedded point sets, etc.). -We also want some unit tests: some specific parameter files together with the correct output that should be observed when generating the points from these files. +We also want some unit tests: some specific parameter files together with the +correct output that should be observed when generating the points from these +files. ## References -- P. Bratley and B. L. Fox. Algorithm 659: Implementing Sobol’s quasirandom sequence generator. ACM Transactions on Mathematical Software, 14(1):88–100, 1988. -- Brent Burley. Practical hash-based Owen scrambling. The Journal of Computer Graphics Techniques, 9(4):1–20, 2020. -- I. 
Friedel and A. Keller. Fast generation of randomized low-discrepancy point sets. In K.-T. Fang, F. J. Hickernell, and H. Niederreiter, editors, Monte Carlo and Quasi-Monte Carlo Methods 2000, pages 257–273, Berlin, 2002. Springer-Verlag. -- T. Goda and J. Dick. Construction of interlaced scrambled polynomial lattice rules of arbitrary high order. Foundation of Computational Mathematics, 15:1245–1278, 2015. -- S. Joe and F. Y. Kuo. Constructing Sobol sequences with better two-dimensional projections. SIAM Journal on Scientific Computing, 30(5):2635–2654, 2008. -- S. Laine and T. Karras. Stratified sampling for stochastic transparency. Computer Graphics Forum, 30(4):1197–1204, 2011. -- P. L’Ecuyer. SSJ: Stochastic simulation in Java. http://simul.iro.umontreal.ca/ssj/, accessed 9th August 2021, 2016. -- P. L’Ecuyer, P. Marion, M. Godin, and F. Puchhammer. A tool for custom construction of QMC and RQMC point sets. In A. Keller, editor, Monte Carlo and Quasi-Monte Carlo Methods: MCQMC 2020, pages 51–70, Berlin, 2022. Springer. https://arxiv.org/abs/2012.10263. -- D. Nuyens. The magic point shop, 2020. https://people.cs.kuleuven.be/~dirk.nuyens/qmc-generators/. +- P. Bratley and B. L. Fox. Algorithm 659: Implementing Sobol’s quasirandom + sequence generator. ACM Transactions on Mathematical Software, 14(1):88–100, + 1988. +- Brent Burley. Practical hash-based Owen scrambling. The Journal of Computer + Graphics Techniques, 9(4):1–20, 2020. +- I. Friedel and A. Keller. Fast generation of randomized low-discrepancy point + sets. In K.-T. Fang, F. J. Hickernell, and H. Niederreiter, editors, Monte + Carlo and Quasi-Monte Carlo Methods 2000, pages 257–273, Berlin, 2002. + Springer-Verlag. +- T. Goda and J. Dick. Construction of interlaced scrambled polynomial lattice + rules of arbitrary high order. Foundation of Computational Mathematics, + 15:1245–1278, 2015. +- S. Joe and F. Y. Kuo. Constructing Sobol sequences with better two-dimensional + projections. 
SIAM Journal on Scientific Computing, 30(5):2635–2654, 2008. +- S. Laine and T. Karras. Stratified sampling for stochastic transparency. + Computer Graphics Forum, 30(4):1197–1204, 2011. +- P. L’Ecuyer. SSJ: Stochastic simulation in Java. + http://simul.iro.umontreal.ca/ssj/, accessed 9th August 2021, 2016. +- P. L’Ecuyer, P. Marion, M. Godin, and F. Puchhammer. A tool for custom + construction of QMC and RQMC point sets. In A. Keller, editor, Monte Carlo and + Quasi-Monte Carlo Methods: MCQMC 2020, pages 51–70, Berlin, 2022. Springer. + https://arxiv.org/abs/2012.10263. +- D. Nuyens. The magic point shop, 2020. + https://people.cs.kuleuven.be/~dirk.nuyens/qmc-generators/. From fbb0ded62292e3f7b60726348e25f884d7af1b45 Mon Sep 17 00:00:00 2001 From: sou-cheng-choi Date: Thu, 20 Nov 2025 08:07:46 -0600 Subject: [PATCH 03/15] Create LICENSE.txt --- LICENSE.txt | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 LICENSE.txt diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000..f67ae09 --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,13 @@ +Copyright [2021] [Illinois Institute of Technology] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
From 5ab3909ebe82b6ca5b6096ab65a5a9bb107e0f17 Mon Sep 17 00:00:00 2001 From: sou-cheng-choi Date: Thu, 20 Nov 2025 08:08:36 -0600 Subject: [PATCH 04/15] Rename README.md to LD_DATA.md --- LD_DATA.md | 582 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 582 insertions(+) create mode 100644 LD_DATA.md diff --git a/LD_DATA.md b/LD_DATA.md new file mode 100644 index 0000000..4d0d6f9 --- /dev/null +++ b/LD_DATA.md @@ -0,0 +1,582 @@ +# Low Discrepancy Data + +Low discrepancy generating vectors and matrices. + +## Softwares + +- [LatNet Builder](https://github.com/umontreal-simul/latnetbuilder) +- [Magic Point Shop](https://people.cs.kuleuven.be/~dirk.nuyens/qmc-generators/) + +## General ideas and goals + +We propose *standard formats* to specify lattice rules, polynomial lattice +rules, and digital nets, in simple text files. We want the formats to be simple +and relatively compact, with no more than one line per dimension, so they can +easily be used for point sets in several thousand dimensions if desired. +Ordinary text files with decimal numbers are good enough. They are easy to read +by both humans and computers in any language. Other specialized formats (Json or +Parquet, for example) can be more compact but then the files are not as easy to +read without extra tools. + +Joe and Kuo (2008) provide a file for up to 21201 dimensions for Sobol' points, +and this seems to be the most widely used set of parameters for RQMC points at +this time. They use a fixed and very simple format, which requires no special +software to read. We want to provide similar types of files for other types of +point sets, for an arbitrarily large number of dimensions. The SSJ simulation +library (L’Ecuyer, 2016) can already read some of our proposed formats. Latnet +Builder (L’Ecuyer et al., 2022) can (or will) produce output files in these +formats. 
The choice of output format for Latnet Builder can be specified on the +command line, using the following keywords: + +- `lattice` +A lattice rule: give the modulus and the generating vector. +- `dnet` +A digital net: give the generating matrices, one per line. +- `plattice` +A polynomial lattice rule: give the polynomial modulus and the generating vector. +- `sobol` +Sobol' points (default format), give only the direction numbers. +- `soboljk` +Sobol' points, the format used by Joe and Kuo (2008). + + +The most important formats are the first two, since the point sets covered by +the other formats are special cases of digital nets, so they can all be +described by the `dnet` format. We propose them because they provide alternative +representations that are either more compact or commonly used. + +All the point sets that we consider have the form + +$$P_n = \{\boldsymbol{u}_i \in [0,1)^s,\; i=0,\dots,n-1\}$$ + +where $n$ is the number of points and $s$ is the number of dimensions. The +dimension $j$ goes from 1 to $s$ and there are $n$ points enumerated by $i$ +going from 0 to $n-1$. (In computer code, $j$ usually starts at 0, whereas in +math papers, it starts at 1; one must be careful about this discrepancy.) The +.txt files that contain the parameters have one line per dimension, preceded by +a few lines that contain general parameters, such as $s$, $n$, etc. We shall +call these lines the *header* of the file. In the header, additional lines that +start with `#` can be used for comments and descriptions; these lines are +totally optional and should be just skipped by the program that reads the file. +Anything that starts with `#` on any given line in the header should also be +skipped. All these comments are only for human readers to better see what is in +the file, they are not for the computer[^1]. One exception: the first line of +the file must be a comment that contains the keyword for the file type; for +example `dnet` for a digital net. 
The number of dimensions (number of lines +after the header) can be much larger than what we usually need; it suffices to +use the number of rows that are needed. + +[^1]: Comments are now allowed only in the header lines, + not in the $s$ lines that follow. This makes more sense. + +The point sets can be extensible in the number of points $n$ or not (they can be +constructed for a single $n$ only). Sobol points are extensible ad infinitum, +although they are very good only when $n$ is a power of 2. Other types of point +sets can also be extensible, but are usually constructed to be good only +for $n = b^k$ for $k$ in a given integer range, e.g., from 10 to 20, where $b +\ge 2$ is the base. They satisfy the property that $P_{b^k}$ contains +$P_{b^{k-1}}$ for all $k$ in this range. We call them *embedded* point sets. For +these types of point sets it is highly recommended to specify the range in a +comment in the header of the file [^2]. + +[^2]: The range for which the points were built could be given in the file, but + this makes things a bit more complicated for some point sets. For example, + for ordinary lattice rules with a prime number of points, this additional + info might be confusing for some users. For Sobol points, the range has no + limit. + +In the proposed formats, the files do not assume a given computer word size +(e.g., 32 bits or 64 bits). The format is exactly the same regardless of the +word size. Of course, if the file contains integers of more than 32 bits, the +corresponding points cannot be generated properly on a 32 bit computer. A +comment in the file header can say it. + +Some users might prefer input files with no header at all, only the $s$ lines +that give the generating vector or generating matrices. In some languages +(e.g., MATLAB), such a file can be read into a matrix by a simple "load file" +command, so there is no need to write any code to read the file. 
Users who +want that can simply strip out the header from the files in standard format and +use these naked files privately. We think that the header with human-readable +comments as imposed by the standard will be very useful to many users. + +The following sections describe the proposed text-file formats for the different +point sets. + + +## Types of point sets and notation + + +### Parameters for ordinary lattice rules: `lattice` + +For an ordinary *lattice rule of rank 1*, we have + +$$P_n = \{\boldsymbol{u}_i = (i \boldsymbol{a} \bmod n)/n,\; i=0,\dots,n-1\}$$ + +where $\boldsymbol{a} = (a_1,\dots,a_s)$ is the generating vector. We must +specify $s$, $n$, and $\boldsymbol{a}$. + +In a `lattice` file, the first line must start with `# lattice`. After that, not +counting the comment lines, the first line gives the number $s$ of dimensions, +the second line gives the number $n$ of points, and lines 3 to $s+2$ give the +coefficients $a_1,\dots, a_s$, one value per line. In the case of embedded +lattice rules, $n$ would usually be a power of 2, say $n=2^k$, and the smaller +embedded lattices will contain the first $2^{k-1}$ points, the first $2^{k-2}$ +points, etc. Additional comments in the file should tell when the lattice is +embedded, which figure of merit and what weights were used, the construction +method, etc. [^3] + +[^3]: Should we be forced to always put the embedding range in the file for the + computer to read? My suggestion is not to force the computer to read it, + but just put it as a comment for humans, Otherwise, it will force an + additional line that gives the base and the range. What would we put for + example if $n$ is prime and the rule is not embedded? Should we have a + `lattice2` format for embedded rules in base 2? Putting more options makes + things more complicated. + +One example of a parameter file for an ordinary lattice rule, in `lattice` +format is given below. 
In this file, the comment lines are skipped, only the number +`8` is read on the first non-comment line, only the number `65536` is read on the +second non-comment line, etc. + +``` +# lattice +# A lattice rule, non-embedded, in 'lattice' format +8 # 8 dimensions +65536 # modulus = n = 65536 points +# coordinates of the generating vector, starting at j=1: +1 +19463 +17213 +5895 +14865 +31925 +30921 +26671 +``` + +### Parameters for digital nets: `dnet` + +A *digital net in base* $b$ with $n=b^k$ points is defined by selecting integers +$s \geq 1$, $r \geq k \geq 1$, and $s$ matrices +$\boldsymbol{C}_1,\dots,\boldsymbol{C}_s$ of size $r \times k$ with entries in +$\mathbb{Z}_b$, called the generating matrices. For $i=0,\dots,n-1$, let $i = +\sum_{\ell=0}^{k-1} a_{i,\ell} b^\ell$ be the expansion of $i$ in base $b$, and +for $j=1,\dots,s$, let + +$$(y_{i, j, 1}, \dots, y_{i, j, r})^T = \boldsymbol{C}_j \cdot (a_{i, 0}, \dots, a_{i, k-1})^T$$ + +and + +$$u_{i, j} = \sum_{\ell=1}^{r} y_{i, j, \ell} b^{-\ell}.$$ + +The points $\boldsymbol{u}_i$ are defined by $\boldsymbol{u}_i = +(u_{i,1},\dots,u_{i,s})$. Digital nets are usually in base $b=2$, but we allow a +general (typically prime) base $b \ge 2$. + +The proposed format to specify digital nets is as follows. The first line must +start with `# dnet`. Then the first four non-comment lines give $b$ (the base), +$s$ (the number of dimensions), $k$ (the number of columns), and $r$ (the number +of rows in the generating matrices in base $b$). Thus, the output values will +have "precision" $b^{-r}$ (they will be integer multiples of $b^{-r}$). For +$b=2$, a common value in the past has been $r=31$ when using 32 bit integers, +but going forward we should use 64 bit integers and $r=63$ or 64, or perhaps +$r=53$ to exploit the full accuracy of a `double`. By looking at $r$, one can +see right away whether this file is good for 64 bit computers only or for 32 bit +computers as well. 
+ +The $s$ lines after this header will contain the $s$ generating matrices, one +per line. Each of these lines contains $k$ integers smaller than $b^r$ giving +the $k$ columns of $\boldsymbol{C}_j$, using by default the same encoding as in +the class `DigitalNetBase2` in SSJ for $b=2$. That is, the base $b$ +representation of the integer gives the $r$ digits in the corresponding column, +with the digit on the first row of the matrix (row 0) being the most +significant, and the one on the last row (row $r-1$) being the least +significant. For example, if $b=2$, $r=31$, and the first column has a 1 in the +first row and 0 in all other rows, as is always the case for Sobol points, then +the integer representation of this column will be $2^{30} = 1\,073\,741\,824$. +If there is a 1 in the last row and 0 elsewhere, the representation will be $2^0 += 1$. If all 31 elements of the column are 1, the representation will be +$2^{31}-1$. + +One example of a file for a digital net in `dnet` format: + +``` +# dnet +# A digital net in base 2, in 'dnet' format +2 # basis b = 2 +8 # s = 8 dimensions +10 # k = 10, so n = 2^10 = 1024 points +31 # r = 31 digits +# The columns of gen. matrices C_1, ..., C_s, one matrix per line: +1073741824 536870912 268435456 134217728 ... +2012537125 1382645254 ... +... +``` + +This differs from Joe and Kuo (2008), where the $c$-th column (for $c\ge 1$) is +represented by an integer smaller than $2^c$ (in base 2) and the least +significant bit is the one on the diagonal. Their representation works when +$\boldsymbol{C}_j$ is upper triangular, which is true for Sobol point sets, but +not for digital nets in general. + +Recall that coordinate $j$ of the $i$-th point is obtained by multiplying the +base $b$ matrix $\boldsymbol{C}_j$ by the vector of digits of the representation +of $i$ in base $b$, with the least significant digits of $i$ at the top. That +is, the least significant digit of $i$ goes with the first column of +$\boldsymbol{C}_j$. 
And the first row of $\boldsymbol{C}_j$ is for the most +significant digit of output. With our representation of $\boldsymbol{C}_j$ by +$k$ integers, the points are easy and fast to generate in base 2. We obtain +`u[i,j]`, coordinate $j$ of point $i$, with the following code snippet, in which +`C[j,c]` is the integer that represents column $c$ of $\boldsymbol{C}_j$: + +```python +normFactor = 1.0 / (1 << r) # 2^(-r) +coord = 0 +for c in range(k): + coord ^= ((i >> c) & 1) * C[j,c] + u[i,j] = coord * normFactor +``` + + +### Parameters for polynomial lattice rules: `plattice` + +*Polynomial lattice rules* are a special type of digital nets with generating +matrices of a special form. For a polynomial lattice rule of rank 1 in a prime +base $b$, we have + +$$P_n = \left\{\left(\varphi\left(\frac{h(z) a_1(z)}{Q(z)}\right), \dots, +\varphi\left(\frac{h(z) a_s(z)}{Q(z)}\right)\right) : h(z)\in\mathbb{F}_b[z], \ +\text{degree}(h(z)) < k\right\}.$$ + +where $\mathbb{F}_b[z]$ is the space of polynomials with coefficients in +$\mathbb{F}_b$, the *modulus* $Q(z) \in \mathbb{F}_b[z]$ is a polynomial of +degree $k$, the *generating vector* $\boldsymbol{a}(z) = +(a_1(z),\dots,a_s(z))\in \mathbb{F}_b[z]^s$ is a vector of $s$ polynomials of +degrees less than $k$, and the mapping $\varphi$ is defined by + +$$\varphi\left(\sum_{l = w}^{\infty} x_l z^{-l}\right) = \sum_{l = \max(w, +1)}^{\infty} x_l b^{-l}.$$ + +This point set has $n = b^k$ points. + +We must specify the polynomial modulus $Q(z)$ and the polynomial generating +vector $\boldsymbol{a}(z)$. The polynomial modulus will be represented as an +integer that has $(k+1)$ digits in base $b$, and all the other polynomials will +be represented as integers that have no more than $k$ digits in base $b$. All +these integers will be given in base 10 in the file, one per line. In practice, +we usually have $b=2$, so $k$ represents the number of bits. 
The integer that +represents a polynomial is obtained simply by replacing the formal variable by +$b$. For example, if the polynomial is $Q(z) = z^4 + z^3 + 1$ and $b=2$, its +coefficients are "1 1 0 0 1" and its integer representation is $2^4 + 2^3 + 1 = +25$. This is the usual representation, as used in Goda and Dick (2015), for +example. In the case of embedded point sets, the modulus should be $Q(z) = z^k$ +for $n=b^k$ points, and its integer representation is $b^k$. In particular, +$Q(z) = z$ is represented by the integer $b$. + +As usual, the first line is a comment that tells the type of file. Then the +first four non-comment lines give the base $b$, the number $s$ of dimensions, +the degree $k$ of the polynomial modulus, and the integer representation of this +polynomial. Lines 5 to $s+4$ give the polynomials that form the generating +vector, one per line, using the integer representation just explained. One +example of a file for a polynomial lattice in the `plattice` format: + +``` +# plattice +# A polynomial lattice rule in base 2, in 'plattice' format +2 # base b = 2 +8 # s = 8 dimensions +16 # n = 2^16 = 65536 points +45781 # polynomial modulus +# coordinates of the generating vector, starting at j=1: +1 +17213 +5895 +14865 +31925 +30921 +26671 +17213 +``` + +A polynomial lattice rule in base $b$ can also be represented as a digital net +in base $b$, so its parameters can also be provided in a file in the `dnet` +format, as for general digital net in base $b$. But the generating matrices have +a special form and the above representation is much more compact (a single +integer per row instead of $k$ integers per row). On the other hand, generating +the points is faster with the generating matrices than with the polynomial +representation, so the software that will use the `plattice` files and generate +the points would usually first convert the polynomials into the corresponding +generating matrices. 
LatNet Builder (L’Ecuyer et al., 2022) is also able to make +the conversion and produce a file in the `dnet` format, for more convenience and +better flexibility, so the user can select the format she/he prefers. + +### Parameters for Sobol nets: `sobol` and `soboljk` + +The Sobol' construction provides another special case of digital nets (and +sequences), in base 2. They are defined in many places, including Joe and Kuo +(2008). For each coordinate $j$, we select a primitive polynomial $p_j(z)$ of +degree $c_j$, and $c_j$ integers $m_{j,1},\dots,m_{j,c_j}$ which are used to +define the generating matrix $\boldsymbol{C}_j$. The real numbers $2^{-c} +m_{j,c}$ are called the initial *direction numbers*. More details are given in +Joe and Kuo (2008) and +[here](http://umontreal-simul.github.io/ssj/docs/master/classumontreal_1_1ssj_1_1hups_1_1SobolSequence.html). + +One obvious option for these point sets is to adopt exactly the same format as +Joe and Kuo (2008), because it is already used in many places. The only +difference is that we now allow comment lines in the file. In the format of Joe +and Kuo (2008), only the first line is skipped. In the proposed format, other +comment lines can be added at the beginning of the file, e.g., to give the +maximum number of dimensions in the file, the criterion and weights that were +used, etc. Note that Sobol' sequences have an infinite number of points and an +unlimited number of dimensions, although the file will give parameters for a +finite number of dimensions. + +The other lines of the file specify the primitive polynomials and the initial +direction numbers for each dimension $j \ge 2$, one line per dimension. For +dimension $j=1$, the generating matrix is the identity and is not given in the +file (it is implicit). The columns of this matrix are not obtained via a +recurrence based on a primitive polynomial, so this matrix is handled +separately. + +The first number on each line is the dimension $j$. 
The second number is the +degree $c_j$ of the primitive polynomial $p_j(x)$ used for this dimension. The +third number is the integer that corresponds to the binary representation of the +inner coefficients of this polynomial (we ignore the first and last +coefficients, they are always 1). For example, if the polynomial is $p_j(x) = +x^4 + x^3 + 1$, the coefficients are "1 1 0 0 1", and after removing the first +and last "1", we get 100 in base 2, which is 4, so the third column would +contain the number 4. (Without removing the first and last "1", the number would +be 25 instead.) After these three numbers, there are $c_j$ integers +$m_{j,1},\dots,m_{j,c_j}$ where $m_{j,c}$ is the $c^\text{th}$ (real-valued) +initial direction number for this coordinate, multiplied by $2^c$ to obtain an +integer. This $m_{j,c}$ is the integer formed by taking the bits in row 1 to row +$c$ of column $c$, in this order. The last bit is the bit on the diagonal, which +is always 1, so all $m_{j,c}$'s are odd integers. I think this format comes from +Bratley and Fox (1988). + +We denote this format for Sobol parameters by the `soboljk` keyword. One example +of a file in this format is shown below. The first line gives the type of file +and the next three lines are comments that must be skipped by the reading +program. + +``` +# soboljk +# Parameters for Sobol points, in 'soboljk' format +# 8 dimensions +# c_j p_j m_{j,c} +2 1 0 1 +3 2 1 1 3 +4 3 1 1 3 1 +5 3 2 1 1 1 +6 4 1 1 1 3 3 +7 4 4 1 3 5 13 +8 5 2 1 1 5 5 17 +``` + +The `soboljk` format can be simplified as follows. First, removing the first and +last "1" in the representation of the primitive polynomials saves a bit of +memory, but it also makes things slightly more complicated. In the default +representations of the primitive polynomials in the code that generates the +points, these bits are usually not removed. In SSJ, the first thing we do when +reading a file in `soboljk` format is to add them back.
Also, the primitive +polynomials can be in a separate file, since they never change, and only the +(initial) direction numbers (those depend on the selected FOM and weights) would +be given to specify the Sobol' points. That is, we remove the first three +columns of the `soboljk` format. The Magic Point Shop (Nuyens, 2020) also +produces files that contain only the direction numbers. + +One example of a file in this `sobol` format: + +``` +# sobol +# Parameters m_{j,c} for Sobol points, in 'sobol' format +# 8 dimensions +1 # This is m_{j,c} for the second coordinate +1 3 +1 3 1 +1 1 1 +1 1 3 3 +1 3 5 13 +1 1 5 5 17 +``` + +A list of the first few primitive polynomials in base 2 is given +[here](https://mathworld.wolfram.com/PrimitivePolynomial.html). If we *do not* +remove the first and last 1's in their representations, the first primitive +polynomials are: `3, 7, 11, 13, 19, 25, ...`. Their degrees are `1, 2, 3, 3, 4, +4, ...`. This representation is the one used in the code of SSJ, for example. We +can have a separate file that gives these polynomials, one per line, exactly as +in the first three columns of the `soboljk` format. We may also want to remove +the first column. + +Another, perhaps more convenient, way of storing Sobol' constructions is to just +use the general `dnet` format, in which the generating matrices are given +explicitly. This `dnet` format is easier to use. On the other hand, it requires +specifying a (maximum) value of $k$, and $k$ integers per row to specify the +generating matrices, which leads to larger files. From a file in `sobol` format, +one can construct a digital net with an arbitrarily large $k$. + +When $n = 2^k$ is fixed, so we use exactly $n = 2^k$ points and there is +no embedding, we can add one *extra dimension* at the beginning by using the +reflected identity as a generating matrix. The successive values for this +coordinate will then be $0, 1/n, 2/n, 3/n, \dots$ in this order.
This matrix +will not be given in the file for Sobol' points; the QMC/RQMC software must +handle it. For lattice rules and general digital nets with fixed $n$ +(non-embedded), the file could give a first coordinate with this behavior. + +## Files that contain randomizations + +The idea of proposing a format for storing specific randomizations was suggested by Fred Hickernell. This can be useful for verification purposes, for example. + +We can store randomizations in the following file formats: + +- `shiftmod1` +A (random) shift modulo 1. It corresponds to a single point in $[0,1)^s$. +- `dshift` +A digital shift in base $b$. +Also a single point in $[0,1)^s$, but with $r$ digits in base $b$. +- `nuscramble` +A nested uniform scramble in base $b$. +- `lmscramble` +A (linear) left matrix scramble in base $b$. + + +For a `shiftmod1` in $s$ dimensions, the file will contain $s$ in the first +line, followed by $s$ real numbers between 0 and 1, one per line. + +``` +# shiftmod1 +# A shift modulo 1, in 'shiftmod1' format +3 # s = 3 dimensions +0.32638741823951621 +0.91325392536931693 +0.1530364040t106301 +``` + +For a `dshift` with $r$ digits of accuracy in base $b$, in $s$ dimensions, the +file will contain $b$ in the first line, $s$ in the second line, $r$ in the +third line, and then $s$ integers from 0 to $b^r-1$, one per line. For the +latter, the digits of the base $b$ representation of the integer divided by +$b^r$ will be added modulo $b$ to the corresponding digits of the base $b$ +representation of the coordinate. For example, if $b=2$ and $r=31$, the +randomization makes a xor of the 31 bits of this integer with the 31 most +significant bits of the corresponding coordinate of each point. 
+ +``` +# dshift +# A digital shift in base 2, in 'dshift' format +2 # b = 2 +3 # s = 3 +31 # r = 31 +2146832861 +1084390381 +963462828 +``` + +For a `lmscramble` with $r$ digits of accuracy, for $b^k$ points in base $b$ in +$s$ dimensions, we need to store $s$ lower-triangular invertible $r\times r$ +matrices with entries in $\{0,\dots,b-1\}$. For $b=2$, each matrix must have +only 1's on the diagonal and 0's above the diagonal. Each such matrix can be +stored in one line of the file, in exactly the same format as the generating +matrices in the `dnet` format, using one integer for each column. We want them +in this format for the fast LMS implementation we have in SSJ, for example. The +file will contain $b$ in the first non-comment line, $s$ in the second line, $r$ +in the third line, and then $s$ square lower-triangular and invertible $r\times +r$ matrices, one per line, with each column represented as an integer as in the +`dnet` format. Thus, each scrambling matrix is represented by $r$ integers on +the same line. Here is an example, + +``` +# lmscramble +# A left matrix scramble in base 2, with 31 digits of resolution. +2 # basis b = 2 +8 # s = 8 dimensions +31 # r = 31 digits +# The columns of the lower-triangular r x r scrambling matrices, one matrix per line: +1673741824 906870912 615843556 213427728 ... +2012537125 1012645254 ... +... +``` + +For a `nuscramble` of the first $r \ge k$ digits, for $n=b^k$ points in base $b$ +in $s$ dimensions, with the implementation proposed in Section 3 of Friedel and +Keller (2002) and used for $b=2$ in class `DigitalNetBase2` of SSJ, we need $sn$ +blocks of $r$ random digits in base $b$. Each such block can be represented as +an integer in the range $\{0,1,\dots,b^r-1\}$. For $b=2$, these are $r$ bit +integers. We can store these integers one row per dimension, $n$ integers per +row. This gives the following `nuscramble` file format. 
The first non-comment +line contains the base $b$, the second line gives the number $s$ of dimensions, +the third line gives the scramble resolution (the number of digits that are +scrambled), and the following $s$ lines give the $sn$ integers used for the +scrambling, $n$ integers per line. Note that this is the same amount of random +numbers that we would need if we use plain Monte Carlo instead of RQMC[^4][^5]. + +[^4]: Another way of storing the NUS is as follows. For each coordinate $j$, + each point can be identified by a $k$ bit integer, and the NUS maps each + such integer to an $r$ bit integer that corresponds to the scrambled coordinate + $j$ of this point. So we can simply store this map in an array of size $b^k$ + whose entry $i$ contains the corresponding $r$ bit integer. Applying this + NUS is then fast and straightforward. + +[^5]: Alternative implementations of NUS that use a hashing function in place of + a RNG are proposed in Burley (2020) and Laine and Karras (2011). These + methods might be faster and there is much less information to store to + reproduce a given scramble, but the hashing function must be fixed, known, + and reliable. This essentially amounts to fixing the RNG and storing only its + seed. + +``` +# nuscramble +# A nested uniform scramble in base 2, with 30 bits of resolution. +2 # basis b = 2 +8 # s = 8 dimensions +10 # k = 10, so n = 2^10 = 1024 points +30 # r = 30 digits +# The following s rows contain n = 1024 30 bit integers per row: +1173741824 906870912 615843556 213427728 ... +1012537125 1001975254 ... +... +``` + +## File names and other recommendations + +It is strongly recommended that all file names start with the corresponding +keyword, like `plattice` for a polynomial lattice rule, `sobol` for a Sobol +point set, and `lmscramble` for a left matrix scramble, for example.
+ +It is also recommended to put enough relevant comments in each file for a +knowledgeable human to find what the file is for (type of point set, figure of +merit and weights that were used to construct it, range of values of $n$ for +embedded point sets, etc.). + + +We also want some unit tests: some specific parameter files together with the +correct output that should be observed when generating the points from these +files. + +## References + +- P. Bratley and B. L. Fox. Algorithm 659: Implementing Sobol’s quasirandom + sequence generator. ACM Transactions on Mathematical Software, 14(1):88–100, + 1988. +- Brent Burley. Practical hash-based Owen scrambling. The Journal of Computer + Graphics Techniques, 9(4):1–20, 2020. +- I. Friedel and A. Keller. Fast generation of randomized low-discrepancy point + sets. In K.-T. Fang, F. J. Hickernell, and H. Niederreiter, editors, Monte + Carlo and Quasi-Monte Carlo Methods 2000, pages 257–273, Berlin, 2002. + Springer-Verlag. +- T. Goda and J. Dick. Construction of interlaced scrambled polynomial lattice + rules of arbitrary high order. Foundation of Computational Mathematics, + 15:1245–1278, 2015. +- S. Joe and F. Y. Kuo. Constructing Sobol sequences with better two-dimensional + projections. SIAM Journal on Scientific Computing, 30(5):2635–2654, 2008. +- S. Laine and T. Karras. Stratified sampling for stochastic transparency. + Computer Graphics Forum, 30(4):1197–1204, 2011. +- P. L’Ecuyer. SSJ: Stochastic simulation in Java. + http://simul.iro.umontreal.ca/ssj/, accessed 9th August 2021, 2016. +- P. L’Ecuyer, P. Marion, M. Godin, and F. Puchhammer. A tool for custom + construction of QMC and RQMC point sets. In A. Keller, editor, Monte Carlo and + Quasi-Monte Carlo Methods: MCQMC 2020, pages 51–70, Berlin, 2022. Springer. + https://arxiv.org/abs/2012.10263. +- D. Nuyens. The magic point shop, 2020. + https://people.cs.kuleuven.be/~dirk.nuyens/qmc-generators/. 
From 55889619e525ed1075eae8d8170d23029b681d99 Mon Sep 17 00:00:00 2001 From: sou-cheng-choi Date: Thu, 20 Nov 2025 08:17:24 -0600 Subject: [PATCH 05/15] -line breaks --- LD_DATA.md | 433 ++++++++++------------------------------------------- 1 file changed, 83 insertions(+), 350 deletions(-) diff --git a/LD_DATA.md b/LD_DATA.md index 4d0d6f9..da08e50 100644 --- a/LD_DATA.md +++ b/LD_DATA.md @@ -9,24 +9,11 @@ Low discrepancy generating vectors and matrices. ## General ideas and goals -We propose *standard formats* to specify lattice rules, polynomial lattice -rules, and digital nets, in simple text files. We want the formats to be simple -and relatively compact, with no more than one line per dimension, so they can -easily be used for point sets in several thousand dimensions if desired. -Ordinary text files with decimal numbers are good enough. They are easy to read -by both humans and computers in any language. Other specialized formats (Json or -Parquet, for example) can be more compact but then the files are not as easy to -read without extra tools. - -Joe and Kuo (2008) provide a file for up to 21201 dimensions for Sobol' points, -and this seems to be the most widely used set of parameters for RQMC points at -this time. They use a fixed and very simple format, which requires no special -software to read. We want to provide similar types of files for other types of -point sets, for an arbitrarily large number of dimensions. The SSJ simulation -library (L’Ecuyer, 2016) can already read some of our proposed formats. Latnet -Builder (L’Ecuyer et al., 2022) can (or will) produce output files in these -formats. The choice of output format for Latnet Builder can be specified on the -command line, using the following keywords: +We propose *standard formats* to specify lattice rules, polynomial lattice rules, and digital nets, in simple text files. 
We want the formats to be simple and relatively compact, with no more than one line per dimension, +so they can easily be used for point sets in several thousand dimensions if desired. Ordinary text files with decimal numbers are good enough. They are easy to read by both humans and computers in any language. Other specialized formats (Json or Parquet, for example) can be more compact but then +the files are not as easy to read without extra tools. + +Joe and Kuo (2008) provide a file for up to 21201 dimensions for Sobol' points, and this seems to be the most widely used set of parameters for RQMC points at this time. They use a fixed and very simple format, which requires no special software to read. We want to provide similar types of files for other types of point sets, for an arbitrarily large number of dimensions. The SSJ simulation library (L’Ecuyer, 2016) can already read some of our proposed formats. Latnet Builder (L’Ecuyer et al., 2022) can (or will) produce output files in these formats. The choice of output format for Latnet Builder can be specified on the command line, using the following keywords: - `lattice` A lattice rule: give the modulus and the generating vector. @@ -40,67 +27,26 @@ Sobol' points (default format), give only the direction numbers. Sobol' points, the format used by Joe and Kuo (2008). -The most important formats are the first two, since the point sets covered by -the other formats are special cases of digital nets, so they can all be -described by the `dnet` format. We propose them because they provide alternative -representations that are either more compact or commonly used. +The most important formats are the first two, since the point sets covered by the other formats are special cases of digital nets, so they can all be described by the `dnet` format. We propose them because they provide alternative representations that are either more compact or commonly used. 
All the point sets that we consider have the form $$P_n = \{\boldsymbol{u}_i \in [0,1)^s,\; i=0,\dots,n-1\}$$ -where $n$ is the number of points and $s$ is the number of dimensions. The -dimension $j$ goes from 1 to $s$ and there are $n$ points enumerated by $i$ -going from 0 to $n-1$. (In computer code, $j$ usually starts at 0, whereas in -math papers, it starts at 1; one must be careful about this discrepancy.) The -.txt files that contain the parameters have one line per dimension, preceded by -a few lines that contain general parameters, such as $s$, $n$, etc. We shall -call these lines the *header* of the file. In the header, additional lines that -start with `#` can be used for comments and descriptions; these lines are -totally optional and should be just skipped by the program that reads the file. -Anything that starts with `#` on any given line in the header should also be -skipped. All these comments are only for human readers to better see what is in -the file, they are not for the computer[^1]. One exception: the first line of -the file must be a comment that contains the keyword for the file type; for -example `dnet` for a digital net. The number of dimensions (number of lines -after the header) can be much larger than what we usually need; it suffices to -use the number of rows that are needed. +where $n$ is the number of points and $s$ is the number of dimensions. The dimension $j$ goes from 1 to $s$ and there are $n$ points enumerated by $i$ going from 0 to $n-1$. (In computer code, $j$ usually starts at 0, whereas in math papers, it starts at 1; one must be careful about this discrepancy.) The .txt files that contain the parameters have one line per dimension, preceded by a few lines that contain general parameters, such as $s$, $n$, etc. We shall call these lines the *header* of the file. 
In the header, additional lines that start with `#` can be used for comments and descriptions; these lines are totally optional and should be just skipped by the program that reads the file. Anything that starts with `#` on any given line in the header should also be skipped. All these comments are only for human readers to better see what is in the file, they are not for the computer[^1]. One exception: the first line of the file must be a comment that contains the keyword for the file type; for example `dnet` for a digital net. The number of dimensions (number of lines after the header) can be much larger than what we usually need; it suffices to use the number of rows that are needed. [^1]: Comments are now allowed only in the header lines, not in the $s$ lines that follow. This makes more sense. -The point sets can be extensible in the number of points $n$ or not (they can be -constructed for a single $n$ only). Sobol points are extensible ad infinitum, -although they are very good only when $n$ is a power of 2. Other types of point -sets can also also be extensible, but are usually constructed to be good only -for $n = b^k$ for $k$ in a given integer range, e.g., from 10 to 20, where $b -\ge 2$ is the base. They satisfy the property that $P_{b^k}$ contains -$P_{b^{k-1}}$ for all $k$ in this range. We call them *embedded* point sets. For -these types of point sets it is highly recommended to specify the range in a -comment in the header of the file [^2]. - -[^2]: The range for which the points were built could be given in the file, but - this makes things a bit more complicated for some point sets. For example, - for ordinary lattice rules with a prime number of points, this additional - info might be confusing for some users. For Sobol points, the range has no - limit. - -In the proposed formats, the files do not assume a given computer word size -(e.g., 32 bits or 64 bits). The format is exactly the same regardless of the -word size. 
Of course, if the file contains integers of more than 32 bits, the -corresponding points cannot be generated properly on a 32 bit computer. A -comment in the file header can say it. - -Some users might prefer input files with no header at all, only the $s$ lines -that give the generating vector or generating matrices. In some languages -(e.g., MATLAB), such a file can be read into a matrix by a simple "load file" -command, so there is no need to to write any code to read the file. Users who -want that can simply strip out the header from the files in standard format and -use these naked files privately. We think that the header with human-readable -comments as imposed by the standard will be very useful to many users. - -The following sections describe the proposed text-file formats for the different -point sets. +The point sets can be extensible in the number of points $n$ or not (they can be constructed for a single $n$ only). Sobol points are extensible ad infinitum, although they are very good only when $n$ is a power of 2. Other types of point sets can also be extensible, but are usually constructed to be good only for $n = b^k$ for $k$ in a given integer range, e.g., from 10 to 20, where $b \ge 2$ is the base. They satisfy the property that $P_{b^k}$ contains $P_{b^{k-1}}$ for all $k$ in this range. We call them *embedded* point sets. For these types of point sets it is highly recommended to specify the range in a comment in the header of the file [^2]. + +[^2]: The range for which the points were built could be given in the file, but this makes things a bit more complicated for some point sets. For example, for ordinary lattice rules with a prime number of points, this additional info might be confusing for some users. For Sobol points, the range has no limit. + +In the proposed formats, the files do not assume a given computer word size (e.g., 32 bits or 64 bits). The format is exactly the same regardless of the word size.
Of course, if the file contains integers of more than 32 bits, the corresponding points cannot be generated properly on a 32 bit computer. A comment in the file header can say it. + +Some users might prefer input files with no header at all, only the $s$ lines that give the generating vector or generating matrices. In some languages (e.g., MATLAB), such a file can be read into a matrix by a simple "load file" command, so there is no need to write any code to read the file. Users who want that can simply strip out the header from the files in standard format and use these naked files privately. We think that the header with human-readable comments as imposed by the standard will be very useful to many users. + +The following sections describe the proposed text-file formats for the different point sets. ## Types of point sets and notation @@ -112,31 +58,14 @@ For an ordinary *lattice rule of rank 1*, we have $$P_n = \{\boldsymbol{u}_i = (i \boldsymbol{a} \bmod n)/n,\; i=0,\dots,n-1\}$$ -where $\boldsymbol{a} = (a_1,\dots,a_s)$ is the generating vector. We must -specify $s$, $n$, and $\boldsymbol{a}$. - -In a `lattice` file, the first line must start with `# lattice`. After that, not -counting the comment lines, the first line gives the number $s$ of dimensions, -the second line gives the number $n$ of points, and lines 3 to $s+2$ give the -coefficients $a_1,\dots, a_s$, one value per line. In the case of embedded -lattice rules, $n$ would usually be a power of 2, say $n=2^k$, and the smaller -embedded lattices will contain the first $2^{k-1}$ points, the first $2^{k-2}$ -points, etc. Additional comments in the file should tell when the lattice is -embedded, which figure of merit and what weights were used, the construction -method, etc. [^3] - -[^3]: Should we be forced to always put the embedding range in the file for the - computer to read?
My suggestion is not to force the computer to read it, - but just put it as a comment for humans, Otherwise, it will force an - additional line that gives the base and the range. What would we put for - example if $n$ is prime and the rule is not embedded? Should we have a - `lattice2` format for embedded rules in base 2? Putting more options makes - things more complicated. - -One example of a parameter file for an ordinary lattice rule, in `lattice` -format is given below. In this file, the first line is skipped, only the number -`8` is read on the second line, only the number `65536` is read on the second -line, etc. +where $\boldsymbol{a} = (a_1,\dots,a_s)$ is the generating vector. We must specify $s$, $n$, and $\boldsymbol{a}$. + +In a `lattice` file, the first line must start with `# lattice`. After that, not counting the comment lines, the first line gives the number $s$ of dimensions, the second line gives the number $n$ of points, and lines 3 to $s+2$ give the coefficients $a_1,\dots, a_s$, one value per line. In the case of embedded lattice rules, $n$ would usually be a power of 2, say $n=2^k$, and the smaller embedded lattices will contain the first $2^{k-1}$ points, the first $2^{k-2}$ points, etc. +Additional comments in the file should tell when the lattice is embedded, which figure of merit and what weights were used, the construction method, etc. [^3] + +[^3]: Should we be forced to always put the embedding range in the file for the computer to read? My suggestion is not to force the computer to read it, but just put it as a comment for humans. Otherwise, it will force an additional line that gives the base and the range. What would we put for example if $n$ is prime and the rule is not embedded? Should we have a `lattice2` format for embedded rules in base 2? Putting more options makes things more complicated. + +One example of a parameter file for an ordinary lattice rule, in `lattice` format is given below.
In this file, the first two lines are comments and are skipped, only the number `8` is read on the third line, only the number `65536` is read on the fourth line, etc. ``` # lattice @@ -156,12 +85,7 @@ line, etc. ### Parameters for digital nets: `dnet` -A *digital net in base* $b$ with $n=b^k$ points is defined by selecting integers -$s \geq 1$, $r \geq k \geq 1$, and $s$ matrices -$\boldsymbol{C}_1,\dots,\boldsymbol{C}_s$ of size $r \times k$ with entries in -$\mathbb{Z}_b$, called the generating matrices. For $i=0,\dots,n-1$, let $i = -\sum_{\ell=0}^{k-1} a_{i,\ell} b^\ell$ be the expansion of $i$ in base $b$, and -for $j=1,\dots s$, let +A *digital net in base* $b$ with $n=b^k$ points is defined by selecting integers $s \geq 1$, $r \geq k \geq 1$, and $s$ matrices $\boldsymbol{C}_1,\dots,\boldsymbol{C}_s$ of size $r \times k$ with entries in $\mathbb{Z}_b$, called the generating matrices. For $i=0,\dots,n-1$, let $i = \sum_{\ell=0}^{k-1} a_{i,\ell} b^\ell$ be the expansion of $i$ in base $b$, and for $j=1,\dots,s$, let $$(y_{i, j, 1}, \dots, y_{i, j, r})^T = \boldsymbol{C}_j \cdot (a_{i, 0}, \dots, a_{i, k-1})^T$$ @@ -169,34 +93,12 @@ and $$u_{i, j} = \sum_{\ell=1}^{r} y_{i, j, \ell} b^{-\ell}.$$ -The points $\boldsymbol{u}_i$ are defined by $\boldsymbol{u}_i = -(u_{i,1},\dots,u_{i,s})$. Digital nets are usually in base $b=2$, but we allow a -general (typically prime) base $b \ge 2$. - -The proposed format to specify digital nets is as follows. The first line must -start with `# dnet`. Then the first four non-comment lines give $b$ (the base), -$s$ (the number of dimensions), $k$ (the number of columns), and $r$ (the number -of rows in the generating matrices in base $b$). Thus, the output values will -have "precision" $b^{-r}$ (they will be integer multiples of $b^{-r}$). For -$b=2$, a common value in the past has been $r=31$ when using 32 bit integers, -but going forward we should use 64 bit integers and $r=63$ or 64, or perhaps -$r=53$ to exploit the full accuracy of a `double`.
By looking at $r$, one can -see right away whether this file is good for 64 bit computers only or for 32 bit -computers as well. - -The $s$ lines after this header will contain the $s$ generating matrices, one -per line. Each of these lines contains $k$ integers smaller than $b^r$ giving -the $k$ columns of $\boldsymbol{C}_j$, using by default the same encoding as in -the class `DigitalNetBase2` in SSJ for $b=2$. That is, the base $b$ -representation of the integer gives the $r$ digits in the corresponding column, -with the digit on the first row of the matrix (row 0) being the most -significant, and the one on the last row (row $r-1$) being the least -significant. For example, if $b=2$, $r=31$, and the first column has a 1 in the -first row and 0 in all other rows, as is always the case for Sobol points, then -the integer representation of this column will be $2^{30} = 1\,073\,741\,824$. -If there is a 1 in the last row and 0 elsewhere, the representation will be $2^0 -= 1$. If all 31 elements of the column are 1, the representation will be -$2^{31}-1$. +The points $\boldsymbol{u}_i$ are defined by $\boldsymbol{u}_i = (u_{i,1},\dots,u_{i,s})$. Digital nets are usually in base $b=2$, but we allow a general (typically prime) base $b \ge 2$. + +The proposed format to specify digital nets is as follows. The first line must start with `# dnet`. Then the first four non-comment lines give $b$ (the base), $s$ (the number of dimensions), $k$ (the number of columns), and $r$ (the number of rows in the generating matrices in base $b$). Thus, the output values will have "precision" $b^{-r}$ (they will be integer multiples of $b^{-r}$). For $b=2$, a common value in the past has been $r=31$ when using 32 bit integers, but going forward we should use 64 bit integers and $r=63$ or 64, or perhaps $r=53$ to exploit the full accuracy of a `double`. By looking at $r$, one can see right away whether this file is good for 64 bit computers only or for 32 bit computers as well. 
+ +The $s$ lines after this header will contain the $s$ generating matrices, one per line. Each of these lines contains $k$ integers smaller than $b^r$ giving the $k$ columns of $\boldsymbol{C}_j$, using by default the same encoding as in the class `DigitalNetBase2` in SSJ for $b=2$. That is, the base $b$ representation of the integer gives the $r$ digits in the corresponding column, with the digit on the first row of the matrix (row 0) being the most significant, and the one on the last row (row $r-1$) being the least significant. For example, if $b=2$, $r=31$, and the first column has a 1 in the first row and 0 in all other rows, +as is always the case for Sobol points, then the integer representation of this column will be $2^{30} = 1\,073\,741\,824$. If there is a 1 in the last row and 0 elsewhere, the representation will be $2^0 = 1$. If all 31 elements of the column are 1, the representation will be $2^{31}-1$. One example of a file for a digital net in `dnet` format: @@ -213,21 +115,9 @@ One example of a file for a digital net in `dnet` format: ... ``` -This differs from Joe and Kuo (2008), where the $c$-th column (for $c\ge 1$) is -represented by an integer smaller than $2^c$ (in base 2) and the least -significant bit is the one on the diagonal. Their representation works when -$\boldsymbol{C}_j$ is upper triangular, which is true for Sobol point sets, but -not for digital nets in general. - -Recall that coordinate $j$ of the $i$-th point is obtained by multiplying the -base $b$ matrix $\boldsymbol{C}_j$ by the vector of digits of the representation -of $i$ in base $b$, with the least significant digits of $i$ at the top. That -is, the least significant digit of $i$ goes with the first column of -$\boldsymbol{C}_j$. And the first row of $\boldsymbol{C}_j$ is for the most -significant digit of output. With our representation of $\boldsymbol{C}_j$ by -$k$ integers, the points are easy and fast to generate in base 2. 
We obtain -`u[i,j]`, coordinate $j$ of point $i$, with the following code snippet, in which -`C[j,c]` is the integer that represents column $c$ of $\boldsymbol{C}_j$: +This differs from Joe and Kuo (2008), where the $c$-th column (for $c\ge 1$) is represented by an integer smaller than $2^c$ (in base 2) and the least significant bit is the one on the diagonal. Their representation works when $\boldsymbol{C}_j$ is upper triangular, which is true for Sobol point sets, but not for digital nets in general. + +Recall that coordinate $j$ of the $i$-th point is obtained by multiplying the base $b$ matrix $\boldsymbol{C}_j$ by the vector of digits of the representation of $i$ in base $b$, with the least significant digits of $i$ at the top. That is, the least significant digit of $i$ goes with the first column of $\boldsymbol{C}_j$. And the first row of $\boldsymbol{C}_j$ is for the most significant digit of output. With our representation of $\boldsymbol{C}_j$ by $k$ integers, the points are easy and fast to generate in base 2. We obtain `u[i,j]`, coordinate $j$ of point $i$, with the following code snippet, in which `C[j,c]` is the integer that represents column $c$ of $\boldsymbol{C}_j$: ```python normFactor = 1.0 / (1 << r) # 2^(-r) @@ -240,45 +130,19 @@ for c in range(k): ### Parameters for polynomial lattice rules: `plattice` -*Polynomial lattice rules* are a special type of digital nets with generating -matrices of a special form. For a polynomial lattice rule of rank 1 in a prime -base $b$, we have +*Polynomial lattice rules* are a special type of digital nets with generating matrices of a special form. 
For a polynomial lattice rule of rank 1 in a prime base $b$, we have -$$P_n = \left\{\left(\varphi\left(\frac{h(z) a_1(z)}{Q(z)}\right), \dots, -\varphi\left(\frac{h(z) a_s(z)}{Q(z)}\right)\right) : h(z)\in\mathbb{F}_b[z], \ -\text{degree}(h(z)) < k\right\}.$$ +$$P_n = \left\{\left(\varphi\left(\frac{h(z) a_1(z)}{Q(z)}\right), \dots, \varphi\left(\frac{h(z) a_s(z)}{Q(z)}\right)\right) : h(z)\in\mathbb{F}_b[z], \ \text{degree}(h(z)) < k\right\}.$$ -where $\mathbb{F}_b[z]$ is the space of polynomials with coefficients in -$\mathbb{F}_b$, the *modulus* $Q(z) \in \mathbb{F}_b[z]$ is a polynomial of -degree $k$, the *generating vector* $\boldsymbol{a}(z) = -(a_1(z),\dots,a_s(z))\in \mathbb{F}_b[z]^s$ is a vector of $s$ polynomials of -degrees less than $k$, and the mapping $\varphi$ is defined by +where $\mathbb{F}_b[z]$ is the space of polynomials with coefficients in $\mathbb{F}_b$, the *modulus* $Q(z) \in \mathbb{F}_b[z]$ is a polynomial of degree $k$, the *generating vector* $\boldsymbol{a}(z) = (a_1(z),\dots,a_s(z))\in \mathbb{F}_b[z]^s$ is a vector of $s$ polynomials of degrees less than $k$, and the mapping $\varphi$ is defined by -$$\varphi\left(\sum_{l = w}^{\infty} x_l z^{-l}\right) = \sum_{l = \max(w, -1)}^{\infty} x_l b^{-l}.$$ +$$\varphi\left(\sum_{l = w}^{\infty} x_l z^{-l}\right) = \sum_{l = \max(w, 1)}^{\infty} x_l b^{-l}.$$ This point set has $n = b^k$ points. -We must specify the polynomial modulus $Q(z)$ and the polynomial generating -vector $\boldsymbol{a}(z)$. The polynomial modulus will be represented as an -integer that has $(k+1)$ digits in base $b$, and all the other polynomials will -be represented as integers that have no more than $k$ digits in base $b$. All -these integers will be given in base 10 in the file, one per line. In practice, -we usually have $b=2$, so $k$ represents the number of bits. The integer that -represents a polynomial is obtained simply by replacing the formal variable by -$b$. 
For example, if the polynomial is $Q(z) = z^4 + z^3 + 1$ and $b=2$, its -coefficients are "1 1 0 0 1" and its integer representation is $2^4 + 2^3 + 1 = -25$. This is the usual representation, as used in Goda and Dick (2015), for -example. In the case of embedded point sets, the modulus should be $Q(z) = z^k$ -for $n=b^k$ points, and its integer representation is $b^k$. In particular, -$Q(z) = z$ is represented by the integer $b$. - -As usual, the first line is a comment that tells the type of file. Then the -first four non-comment lines give the base $b$, the number $s$ of dimensions, -the degree $k$ of the polynomial modulus, and the integer representation of this -polynomial. Lines 5 to $s+4$ give the polynomials that form the generating -vector, one per line, using the integer representation just explained. One -example of a file for a polynomial lattice in the `plattice` format: +We must specify the polynomial modulus $Q(z)$ and the polynomial generating vector $\boldsymbol{a}(z)$. The polynomial modulus will be represented as an integer that has $(k+1)$ digits in base $b$, and all the other polynomials will be represented as integers that have no more than $k$ digits in base $b$. All these integers will be given in base 10 in the file, one per line. In practice, we usually have $b=2$, so $k$ represents the number of bits. The integer that represents a polynomial is obtained simply by replacing the formal variable by $b$. For example, if the polynomial is $Q(z) = z^4 + z^3 + 1$ and $b=2$, its coefficients are "1 1 0 0 1" and its integer representation is $2^4 + 2^3 + 1 = 25$. This is the usual representation, as used in Goda and Dick (2015), for example. In the case of embedded point sets, the modulus should be $Q(z) = z^k$ for $n=b^k$ points, and its integer representation is $b^k$. In particular, $Q(z) = z$ is represented by the integer $b$. + +As usual, the first line is a comment that tells the type of file. 
Then the first four non-comment lines give the base $b$, the number $s$ of dimensions, the degree $k$ of the polynomial modulus, and the integer representation of this polynomial. Lines 5 to $s+4$ give the polynomials that form the generating vector, one per line, using the integer representation just explained. One example of a file for a polynomial lattice in the `plattice` format: ``` # plattice @@ -298,66 +162,23 @@ example of a file for a polynomial lattice in the `plattice` format: 17213 ``` -A polynomial lattice rule in base $b$ can also be represented as a digital net -in base $b$, so its parameters can also be provided in a file in the `dnet` -format, as for general digital net in base $b$. But the generating matrices have -a special form and the above representation is much more compact (a single -integer per row instead of $k$ integers per row). On the other hand, generating -the points is faster with the generating matrices than with the polynomial -representation, so the software that will use the `plattice` files and generate -the points would usually first convert the polynomials into the corresponding -generating matrices. LatNet Builder (L’Ecuyer et al., 2022) is also able to make -the conversion and produce a file in the `dnet` format, for more convenience and -better flexibility, so the user can select the format she/he prefers. +A polynomial lattice rule in base $b$ can also be represented as a digital net in base $b$, so its parameters can also be provided in a file in the `dnet` format, as for general digital net in base $b$. But the generating matrices have a special form and the above representation is much more compact (a single integer per row instead of $k$ integers per row). 
On the other hand, generating the points is faster with the generating matrices than with the polynomial representation, so the software that will use the `plattice` files and generate the points would usually first convert the polynomials into the corresponding generating matrices. LatNet Builder (L’Ecuyer et al., 2022) is also able to make the conversion and produce a file in the `dnet` format, for more convenience and better flexibility, so the user can select the format she/he prefers. ### Parameters for Sobol nets: `sobol` and `soboljk` -The Sobol' construction provides another special case of digital nets (and -sequences), in base 2. They are defined in many places, including Joe and Kuo -(2008). For each coordinate $j$, we select a primitive polynomial $p_j(z)$ of -degree $c_j$, and $c_j$ integers $m_{j,1},\dots,m_{j,c_j}$ which are used to -define the generating matrix $\boldsymbol{C}_j$. The real numbers $2^{-c} -m_{j,c}$ are called the initial *direction numbers*. More details are given in -Joe and Kuo (2008) and -[here](http://umontreal-simul.github.io/ssj/docs/master/classumontreal_1_1ssj_1_1hups_1_1SobolSequence.html). - -One obvious option for these point sets is to adopt exactly the same format as -Joe and Kuo (2008), because it is already used in many places. The only -difference is that we now allow comment lines in the file. In the format of Joe -and Kuo (2008), only the first line is skipped. In the proposed format, other -comment lines can be added at the beginning of the file, e.g., to give the -maximum number of dimensions in the file, the criterion and weights that were -used, etc. Note that Sobol' sequences have an infinite number of points and an -unlimited number of dimensions, although the file will give parameters for a -finite number of dimensions. - -The other lines of the file specify the primitive polynomials and the initial -direction numbers for each dimension $j \ge 2$, one line per dimension. 
For -dimension $j=1$, the generating matrix is the identity and is not given in the -file (it is implicit). The columns of this matrix are not obtained via a -recurrence based on a primitive polynomial, so this matrix is handled -separately. - -The first number on each line is the dimension $j$. The second number is the -degree $c_j$ of the primitive polynomial $p_j(x)$ used for this dimension. The -third number is the integer that corresponds to the binary representation of the -inner coefficients of this polynomial (we ignore the first and last -coefficients, they are always 1). For example, if the polynomial is $p_j(x) = -x^4 + x^3 + 1$, the coefficients are "1 1 0 0 1", and after removing the first -and last "1", we get 100 in base 2, which is 4, so the third column would -contain the number 4. (Without removing the first and last "1", the number would -be 25 instead.) After these three numbers, there are $c_j$ integers -$m_{j,1},\dots,m_{j,c_j}$ where $m_{j,c}$ is the $c^\text{th}$ (real-valued) -initial direction number for this coordinate, multiplied by $2^c$ to obtain an -integer. This $m_{j,c}$ is the integer formed by taking the bits in row 1 to row -$c$ of column $c$, in this order. The last bit is the bit on the diagonal, which -is always 1, so all $m_{j,c}$'s are odd integers. I think this format comes from -Bratley and Fox (1988). - -We denote this format for Sobol parameters by the `soboljk` keyword. One example -of a file in this format is shown below. The first line gives the type of file -and the next three lines are comments that must be skipped by the reading -program. +The Sobol' construction provides another special case of digital nets (and sequences), in base 2. They are defined in many places, including Joe and Kuo (2008). For each coordinate $j$, we select a primitive polynomial $p_j(z)$ of degree $c_j$, and $c_j$ integers $m_{j,1},\dots,m_{j,c_j}$ which are used to define the generating matrix $\boldsymbol{C}_j$. 
The real numbers $2^{-c} m_{j,c}$ are called the +initial *direction numbers*. More details are given in Joe and Kuo (2008) and [here](http://umontreal-simul.github.io/ssj/docs/master/classumontreal_1_1ssj_1_1hups_1_1SobolSequence.html). + +One obvious option for these point sets is to adopt exactly the same format as Joe and Kuo (2008), because it is already used in many places. The only difference is that we now allow comment lines in the file. In the format of Joe and Kuo (2008), only the first line is skipped. In the proposed format, other comment lines can be added at the beginning of the file, e.g., to give the maximum number of dimensions in the file, the criterion and weights that were used, etc. +Note that Sobol' sequences have an infinite number of points and an unlimited number of dimensions, although the file will give parameters for a finite number of dimensions. + +The other lines of the file specify the primitive polynomials and the initial direction numbers for each dimension $j \ge 2$, one line per dimension. For dimension $j=1$, the generating matrix is the identity and is not given in the file (it is implicit). The columns of this matrix are not obtained via a recurrence based on a primitive polynomial, +so this matrix is handled separately. + +The first number on each line is the dimension $j$. +The second number is the degree $c_j$ of the primitive polynomial $p_j(x)$ used for this dimension. The third number is the integer that corresponds to the binary representation of the inner coefficients of this polynomial (we ignore the first and last coefficients, they are always 1). For example, if the polynomial is $p_j(x) = x^4 + x^3 + 1$, the coefficients are "1 1 0 0 1", and after removing the first and last "1", we get 100 in base 2, which is 4, so the third column would contain the number 4. (Without removing the first and last "1", the number would be 25 instead.) 
After these three numbers, there are $c_j$ integers $m_{j,1},\dots,m_{j,c_j}$ where $m_{j,c}$ is the $c^\text{th}$ (real-valued) initial direction number for this coordinate, multiplied by $2^c$ to obtain an integer. This $m_{j,c}$ is the integer formed by taking the bits in row 1 to row $c$ of column $c$, in this order. The last bit is the bit on the diagonal, which is always 1, so all $m_{j,c}$'s are odd integers. I think this format comes from Bratley and Fox (1988). + +We denote this format for Sobol parameters by the `soboljk` keyword. One example of a file in this format is shown below. The first line gives the type of file and the next three lines are comments that must be skipped by the reading program. ``` # soboljk @@ -373,17 +194,7 @@ program. 8 5 2 1 1 5 5 17 ``` -The `soboljk` format can be simplified as follows. First, removing the first and -last "1" in the representation of the primitive polynomials saves a bit of -memory, but it also makes thinks slightly more complicated. In the default -representations of the primitive polynomials in the code that generates the -points, these bits are usually not removed. In SSJ, the first thing we do when -reading a file in `soboljk` format is to add them back. Also, the primitive -polynomials can be in a separate file, since they never change, and only the -(initial) direction numbers (those depend on the selected FOM and weights) would -be given to specify the Sobol' points. That is, we remove the first three -columns of the `soboljk` format. The Magic Point Shop (Nuyens, 2020) also -produces files that contain only the direction numbers. +The `soboljk` format can be simplified as follows. First, removing the first and last "1" in the representation of the primitive polynomials saves a bit of memory, but it also makes things slightly more complicated. In the default representations of the primitive polynomials in the code that generates the points, these bits are usually not removed.
In SSJ, the first thing we do when reading a file in `soboljk` format is to add them back. Also, the primitive polynomials can be in a separate file, since they never change, and only the (initial) direction numbers (those depend on the selected FOM and weights) would be given to specify the Sobol' points. That is, we remove the first three columns of the `soboljk` format. The Magic Point Shop (Nuyens, 2020) also produces files that contain only the direction numbers. One example of a file in this `sobol` format: @@ -400,29 +211,11 @@ One example of a file in this `sobol` format: 1 1 5 5 17 ``` -A list of the first few primitive polynomials in base 2 is given -[here](https://mathworld.wolfram.com/PrimitivePolynomial.html). If we *do not* -remove the first and last 1's in their representations, the first primitive -polynomials are: `3, 7, 11, 13, 19, 25, ...`. Their degrees are `1, 2, 3, 3, 4, -4, ...`. This representation is the one used in the code of SSJ, for example. We -can have a separate file that gives these polynomials, one per line, exactly as -in the first three columns of the `soboljk` format. We may also want to remove -the first column. - -Another, perhaps more convenient, way of storing Sobol' constructions is to just -use the general `dnet` format, in which the generating matrices are given -explicitly. This `dnet` format is easier to use. On the other hand, it requires -specifying a (maximum) value of $k$, and $k$ integers per row to specify the -generating matrices, which leads to larger files. From a file in `sobol` format, -one can construct a digital net with an arbitrarily large $k$. - -When $n = 2^k$ is fixed, so we use exactly $n = 2^k$ points and there is there -no embedding, we can add one *extra dimension* at the beginning by using the -reflected identity as a generating matrix. The successive values for this -coordinate will then be $0, 1/n, 2/n, 3/n, \dots$ in this order. 
This matrix -will not be given in the file for Sobol' points; the QMC/RQMC software must -handle it. For lattice rules and general digital nets with fixed $n$ -(non-embedded), the file could give a first coordinate with this behavior. +A list of the first few primitive polynomials in base 2 is given [here](https://mathworld.wolfram.com/PrimitivePolynomial.html). If we *do not* remove the first and last 1's in their representations, the first primitive polynomials are: `3, 7, 11, 13, 19, 25, ...`. Their degrees are `1, 2, 3, 3, 4, 4, ...`. This representation is the one used in the code of SSJ, for example. We can have a separate file that gives these polynomials, one per line, exactly as in the first three columns of the `soboljk` format. We may also want to remove the first column. + +Another, perhaps more convenient, way of storing Sobol' constructions is to just use the general `dnet` format, in which the generating matrices are given explicitly. This `dnet` format is easier to use. On the other hand, it requires specifying a (maximum) value of $k$, and $k$ integers per row to specify the generating matrices, which leads to larger files. From a file in `sobol` format, one can construct a digital net with an arbitrarily large $k$. + +When $n = 2^k$ is fixed, so we use exactly $n = 2^k$ points and there is no embedding, we can add one *extra dimension* at the beginning by using the reflected identity as a generating matrix. The successive values for this coordinate will then be $0, 1/n, 2/n, 3/n, \dots$ in this order. This matrix will not be given in the file for Sobol' points; the QMC/RQMC software must handle it. For lattice rules and general digital nets with fixed $n$ (non-embedded), the file could give a first coordinate with this behavior. ## Files that contain randomizations @@ -441,8 +234,7 @@ A nested uniform scramble in base $b$. A (linear) left matrix scramble in base $b$.
-For a `shiftmod1` in $s$ dimensions, the file will contain $s$ in the first -line, followed by $s$ real numbers between 0 and 1, one per line. +For a `shiftmod1` in $s$ dimensions, the file will contain $s$ in the first line, followed by $s$ real numbers between 0 and 1, one per line. ``` # shiftmod1 @@ -453,14 +245,7 @@ line, followed by $s$ real numbers between 0 and 1, one per line. 0.1530364040t106301 ``` -For a `dshift` with $r$ digits of accuracy in base $b$, in $s$ dimensions, the -file will contain $b$ in the first line, $s$ in the second line, $r$ in the -third line, and then $s$ integers from 0 to $b^r-1$, one per line. For the -latter, the digits of the base $b$ representation of the integer divided by -$b^r$ will be added modulo $b$ to the corresponding digits of the base $b$ -representation of the coordinate. For example, if $b=2$ and $r=31$, the -randomization makes a xor of the 31 bits of this integer with the 31 most -significant bits of the corresponding coordinate of each point. +For a `dshift` with $r$ digits of accuracy in base $b$, in $s$ dimensions, the file will contain $b$ in the first line, $s$ in the second line, $r$ in the third line, and then $s$ integers from 0 to $b^r-1$, one per line. For the latter, the digits of the base $b$ representation of the integer divided by $b^r$ will be added modulo $b$ to the corresponding digits of the base $b$ representation of the coordinate. For example, if $b=2$ and $r=31$, the randomization makes a xor of the 31 bits of this integer with the 31 most significant bits of the corresponding coordinate of each point. ``` # dshift @@ -473,18 +258,9 @@ significant bits of the corresponding coordinate of each point. 963462828 ``` -For a `lmscramble` with $r$ digits of accuracy, for $b^k$ points in base $b$ in -$s$ dimensions, we need to store $s$ lower-triangular invertible $r\times r$ -matrices with entries in $\{0,\dots,b-1\}$. 
For $b=2$, each matrix must have -only 1's on the diagonal and 0's above the diagonal. Each such matrix can be -stored in one line of the file, in exactly the same format as the generating -matrices in the `dnet` format, using one integer for each column. We want them -in this format for the fast LMS implementation we have in SSJ, for example. The -file will contain $b$ in the first non-comment line, $s$ in the second line, $r$ -in the third line, and then $s$ square lower-triangular and invertible $r\times -r$ matrices, one per line, with each column represented as an integer as in the -`dnet` format. Thus, each scrambling matrix is represented by $r$ integers on -the same line. Here is an example, +For a `lmscramble` with $r$ digits of accuracy, for $b^k$ points in base $b$ in $s$ dimensions, we need to store $s$ lower-triangular invertible $r\times r$ matrices with entries in $\{0,\dots,b-1\}$. For $b=2$, each matrix must have only 1's on the diagonal and 0's above the diagonal. Each such matrix can be stored in one line of the file, in exactly the same format as the generating matrices in the `dnet` format, using one integer for each column. We want them in this format for the fast LMS implementation we have in SSJ, for example. The file will contain $b$ in the first non-comment line, $s$ in the second line, $r$ in the third line, +and then $s$ square lower-triangular and invertible $r\times r$ matrices, one per line, with each column represented as an integer as in the `dnet` format. Thus, each scrambling matrix is represented by $r$ integers on the same line. +Here is an example, ``` # lmscramble @@ -498,32 +274,11 @@ the same line. Here is an example, ... ``` -For a `nuscramble` of the first $r \ge k$ digits, for $n=b^k$ points in base $b$ -in $s$ dimensions, with the implementation proposed in Section 3 of Friedel and -Keller (2002) and used for $b=2$ in class `DigitalNetBase2` of SSJ, we need $sn$ -blocks of $r$ random digits in base $b$. 
Each such block can be represented as -an integer in the range $\{0,1,\dots,b^r-1\}$. For $b=2$, these are $r$ bit -integers. We can store these integers one row per dimension, $n$ integers per -row. This gives the following `nuscramble` file format. The first non-comment -line contains the base $b$, the second line gives the number $s$ of dimensions, -the third line gives the scramble resolution (the number of digits that are -scrambled), and the following $s$ lines give the $sn$ integers used for the -scrambling, $n$ integers per line. Note that this is the same amount of random -numbers that we would need if we use plain Monte Carlo instead of RQMC[^4][^5]. - -[^4]: Another way of storing the NUS is as follows. For each coordinate $j$, - each point can be identified by a $k$ bit integer, and the NUS maps each - such $k$ to a $r$ bit integer that corresponds to the scrambled coordinate - $j$ of this point. So we can simply store this map in an array of size $b^k$ - whose entry $i$ contains the corresponding $r$ bit integer. Applying this - NUS is then fast and straightforward. - -[^5]: Alternative implementations of NUS that use a hashing function in place of - a RNG are proposed in Burley (2020) and Laine and Karras (2011). These - methods might be faster and the is much less information to store to - reproduce a given scramble, but the hashing function must be fixed, known, - and reliable. This essentially amount to fixing the RNG and storing only its - seed. +For a `nuscramble` of the first $r \ge k$ digits, for $n=b^k$ points in base $b$ in $s$ dimensions, with the implementation proposed in Section 3 of Friedel and Keller (2002) and used for $b=2$ in class `DigitalNetBase2` of SSJ, we need $sn$ blocks of $r$ random digits in base $b$. Each such block can be represented as an integer in the range $\{0,1,\dots,b^r-1\}$. For $b=2$, these are $r$ bit integers. We can store these integers one row per dimension, $n$ integers per row. 
This gives the following `nuscramble` file format. The first non-comment line contains the base $b$, the second line gives the number $s$ of dimensions, the third line gives the scramble resolution (the number of digits that are scrambled), and the following $s$ lines give the $sn$ integers used for the scrambling, $n$ integers per line. Note that this is the same amount of random numbers that we would need if we use plain Monte Carlo instead of RQMC[^4][^5]. + +[^4]: Another way of storing the NUS is as follows. For each coordinate $j$, each point can be identified by a $k$ bit integer, and the NUS maps each such $k$ bit integer to an $r$ bit integer that corresponds to the scrambled coordinate $j$ of this point. So we can simply store this map in an array of size $b^k$ whose entry $i$ contains the corresponding $r$ bit integer. Applying this NUS is then fast and straightforward. + +[^5]: Alternative implementations of NUS that use a hashing function in place of a RNG are proposed in Burley (2020) and Laine and Karras (2011). These methods might be faster and there is much less information to store to reproduce a given scramble, but the hashing function must be fixed, known, and reliable. This essentially amounts to fixing the RNG and storing only its seed. ``` # nuscramble @@ -540,43 +295,21 @@ numbers that we would need if we use plain Monte Carlo instead of RQMC[^4][^5]. ## File names and other recommendations -It is strongly recommend that all file names start with the corresponding -keyword, like `plattice` for a polynomial lattice rule, `sobol` for a Sobol -point set, and `lmscramble` for a left matrix scramble, for example. +It is strongly recommended that all file names start with the corresponding keyword, like `plattice` for a polynomial lattice rule, `sobol` for a Sobol point set, and `lmscramble` for a left matrix scramble, for example.
-It is also recommended to put enough relevant comments in each file for a -knowledgeable human to find what the file is for (type of point set, figure of -merit and weights that were used to construct it, range of values of $n$ for -embedded point sets, etc.). +It is also recommended to put enough relevant comments in each file for a knowledgeable human to find what the file is for (type of point set, figure of merit and weights that were used to construct it, range of values of $n$ for embedded point sets, etc.). -We also want some unit tests: some specific parameter files together with the -correct output that should be observed when generating the points from these -files. +We also want some unit tests: some specific parameter files together with the correct output that should be observed when generating the points from these files. ## References -- P. Bratley and B. L. Fox. Algorithm 659: Implementing Sobol’s quasirandom - sequence generator. ACM Transactions on Mathematical Software, 14(1):88–100, - 1988. -- Brent Burley. Practical hash-based Owen scrambling. The Journal of Computer - Graphics Techniques, 9(4):1–20, 2020. -- I. Friedel and A. Keller. Fast generation of randomized low-discrepancy point - sets. In K.-T. Fang, F. J. Hickernell, and H. Niederreiter, editors, Monte - Carlo and Quasi-Monte Carlo Methods 2000, pages 257–273, Berlin, 2002. - Springer-Verlag. -- T. Goda and J. Dick. Construction of interlaced scrambled polynomial lattice - rules of arbitrary high order. Foundation of Computational Mathematics, - 15:1245–1278, 2015. -- S. Joe and F. Y. Kuo. Constructing Sobol sequences with better two-dimensional - projections. SIAM Journal on Scientific Computing, 30(5):2635–2654, 2008. -- S. Laine and T. Karras. Stratified sampling for stochastic transparency. - Computer Graphics Forum, 30(4):1197–1204, 2011. -- P. L’Ecuyer. SSJ: Stochastic simulation in Java. - http://simul.iro.umontreal.ca/ssj/, accessed 9th August 2021, 2016. -- P. L’Ecuyer, P. 
Marion, M. Godin, and F. Puchhammer. A tool for custom - construction of QMC and RQMC point sets. In A. Keller, editor, Monte Carlo and - Quasi-Monte Carlo Methods: MCQMC 2020, pages 51–70, Berlin, 2022. Springer. - https://arxiv.org/abs/2012.10263. -- D. Nuyens. The magic point shop, 2020. - https://people.cs.kuleuven.be/~dirk.nuyens/qmc-generators/. +- P. Bratley and B. L. Fox. Algorithm 659: Implementing Sobol’s quasirandom sequence generator. ACM Transactions on Mathematical Software, 14(1):88–100, 1988. +- Brent Burley. Practical hash-based Owen scrambling. The Journal of Computer Graphics Techniques, 9(4):1–20, 2020. +- I. Friedel and A. Keller. Fast generation of randomized low-discrepancy point sets. In K.-T. Fang, F. J. Hickernell, and H. Niederreiter, editors, Monte Carlo and Quasi-Monte Carlo Methods 2000, pages 257–273, Berlin, 2002. Springer-Verlag. +- T. Goda and J. Dick. Construction of interlaced scrambled polynomial lattice rules of arbitrary high order. Foundation of Computational Mathematics, 15:1245–1278, 2015. +- S. Joe and F. Y. Kuo. Constructing Sobol sequences with better two-dimensional projections. SIAM Journal on Scientific Computing, 30(5):2635–2654, 2008. +- S. Laine and T. Karras. Stratified sampling for stochastic transparency. Computer Graphics Forum, 30(4):1197–1204, 2011. +- P. L’Ecuyer. SSJ: Stochastic simulation in Java. http://simul.iro.umontreal.ca/ssj/, accessed 9th August 2021, 2016. +- P. L’Ecuyer, P. Marion, M. Godin, and F. Puchhammer. A tool for custom construction of QMC and RQMC point sets. In A. Keller, editor, Monte Carlo and Quasi-Monte Carlo Methods: MCQMC 2020, pages 51–70, Berlin, 2022. Springer. https://arxiv.org/abs/2012.10263. +- D. Nuyens. The magic point shop, 2020. https://people.cs.kuleuven.be/~dirk.nuyens/qmc-generators/. 
\ No newline at end of file From 9d25c4bb0074f0c4061e8ffa6977aff82e52d565 Mon Sep 17 00:00:00 2001 From: sou-cheng-choi Date: Thu, 20 Nov 2025 08:59:35 -0600 Subject: [PATCH 06/15] First version of upload.py and new README --- README.md | 778 ++++++++++++++++-------------------------------------- upload.py | 204 ++++++++++++++ 2 files changed, 435 insertions(+), 547 deletions(-) create mode 100644 upload.py diff --git a/README.md b/README.md index 4d0d6f9..f5cbdff 100644 --- a/README.md +++ b/README.md @@ -1,582 +1,266 @@ -# Low Discrepancy Data - -Low discrepancy generating vectors and matrices. - -## Softwares - -- [LatNet Builder](https://github.com/umontreal-simul/latnetbuilder) -- [Magic Point Shop](https://people.cs.kuleuven.be/~dirk.nuyens/qmc-generators/) - -## General ideas and goals - -We propose *standard formats* to specify lattice rules, polynomial lattice -rules, and digital nets, in simple text files. We want the formats to be simple -and relatively compact, with no more than one line per dimension, so they can -easily be used for point sets in several thousand dimensions if desired. -Ordinary text files with decimal numbers are good enough. They are easy to read -by both humans and computers in any language. Other specialized formats (Json or -Parquet, for example) can be more compact but then the files are not as easy to -read without extra tools. - -Joe and Kuo (2008) provide a file for up to 21201 dimensions for Sobol' points, -and this seems to be the most widely used set of parameters for RQMC points at -this time. They use a fixed and very simple format, which requires no special -software to read. We want to provide similar types of files for other types of -point sets, for an arbitrarily large number of dimensions. The SSJ simulation -library (L’Ecuyer, 2016) can already read some of our proposed formats. Latnet -Builder (L’Ecuyer et al., 2022) can (or will) produce output files in these -formats. 
The choice of output format for Latnet Builder can be specified on the -command line, using the following keywords: - -- `lattice` -A lattice rule: give the modulus and the generating vector. -- `dnet` -A digital net: give the generating matrices, one per line. -- `plattice` -A polynomial lattice rule: give the polynomial modulus and the generating vector. -- `sobol` -Sobol' points (default format), give only the direction numbers. -- `soboljk` -Sobol' points, the format used by Joe and Kuo (2008). - - -The most important formats are the first two, since the point sets covered by -the other formats are special cases of digital nets, so they can all be -described by the `dnet` format. We propose them because they provide alternative -representations that are either more compact or commonly used. - -All the point sets that we consider have the form - -$$P_n = \{\boldsymbol{u}_i \in [0,1)^s,\; i=0,\dots,n-1\}$$ - -where $n$ is the number of points and $s$ is the number of dimensions. The -dimension $j$ goes from 1 to $s$ and there are $n$ points enumerated by $i$ -going from 0 to $n-1$. (In computer code, $j$ usually starts at 0, whereas in -math papers, it starts at 1; one must be careful about this discrepancy.) The -.txt files that contain the parameters have one line per dimension, preceded by -a few lines that contain general parameters, such as $s$, $n$, etc. We shall -call these lines the *header* of the file. In the header, additional lines that -start with `#` can be used for comments and descriptions; these lines are -totally optional and should be just skipped by the program that reads the file. -Anything that starts with `#` on any given line in the header should also be -skipped. All these comments are only for human readers to better see what is in -the file, they are not for the computer[^1]. One exception: the first line of -the file must be a comment that contains the keyword for the file type; for -example `dnet` for a digital net. 
The number of dimensions (number of lines -after the header) can be much larger than what we usually need; it suffices to -use the number of rows that are needed. - -[^1]: Comments are now allowed only in the header lines, - not in the $s$ lines that follow. This makes more sense. - -The point sets can be extensible in the number of points $n$ or not (they can be -constructed for a single $n$ only). Sobol points are extensible ad infinitum, -although they are very good only when $n$ is a power of 2. Other types of point -sets can also also be extensible, but are usually constructed to be good only -for $n = b^k$ for $k$ in a given integer range, e.g., from 10 to 20, where $b -\ge 2$ is the base. They satisfy the property that $P_{b^k}$ contains -$P_{b^{k-1}}$ for all $k$ in this range. We call them *embedded* point sets. For -these types of point sets it is highly recommended to specify the range in a -comment in the header of the file [^2]. - -[^2]: The range for which the points were built could be given in the file, but - this makes things a bit more complicated for some point sets. For example, - for ordinary lattice rules with a prime number of points, this additional - info might be confusing for some users. For Sobol points, the range has no - limit. - -In the proposed formats, the files do not assume a given computer word size -(e.g., 32 bits or 64 bits). The format is exactly the same regardless of the -word size. Of course, if the file contains integers of more than 32 bits, the -corresponding points cannot be generated properly on a 32 bit computer. A -comment in the file header can say it. - -Some users might prefer input files with no header at all, only the $s$ lines -that give the generating vector or generating matrices. In some languages -(e.g., MATLAB), such a file can be read into a matrix by a simple "load file" -command, so there is no need to to write any code to read the file. 
Users who -want that can simply strip out the header from the files in standard format and -use these naked files privately. We think that the header with human-readable -comments as imposed by the standard will be very useful to many users. - -The following sections describe the proposed text-file formats for the different -point sets. - - -## Types of point sets and notation - - -### Parameters for ordinary lattice rules: `lattice` - -For an ordinary *lattice rule of rank 1*, we have - -$$P_n = \{\boldsymbol{u}_i = (i \boldsymbol{a} \bmod n)/n,\; i=0,\dots,n-1\}$$ - -where $\boldsymbol{a} = (a_1,\dots,a_s)$ is the generating vector. We must -specify $s$, $n$, and $\boldsymbol{a}$. - -In a `lattice` file, the first line must start with `# lattice`. After that, not -counting the comment lines, the first line gives the number $s$ of dimensions, -the second line gives the number $n$ of points, and lines 3 to $s+2$ give the -coefficients $a_1,\dots, a_s$, one value per line. In the case of embedded -lattice rules, $n$ would usually be a power of 2, say $n=2^k$, and the smaller -embedded lattices will contain the first $2^{k-1}$ points, the first $2^{k-2}$ -points, etc. Additional comments in the file should tell when the lattice is -embedded, which figure of merit and what weights were used, the construction -method, etc. [^3] - -[^3]: Should we be forced to always put the embedding range in the file for the - computer to read? My suggestion is not to force the computer to read it, - but just put it as a comment for humans, Otherwise, it will force an - additional line that gives the base and the range. What would we put for - example if $n$ is prime and the rule is not embedded? Should we have a - `lattice2` format for embedded rules in base 2? Putting more options makes - things more complicated. - -One example of a parameter file for an ordinary lattice rule, in `lattice` -format is given below. 
In this file, the first line is skipped, only the number -`8` is read on the second line, only the number `65536` is read on the second -line, etc. +# LDData: Low-Discrepancy Generating Vectors and Matrices -``` -# lattice -# A lattice rule, non-embedded, in 'lattice' format -8 # 8 dimensions -65536 # modulus = n = 65536 points -# coordinates of the generating vector, starting at j=1: -1 -19463 -17213 -5895 -14865 -31925 -30921 -26671 -``` +A curated collection of **low-discrepancy point set parameters** including **lattice rules**, **digital nets**, **polynomial lattice rules**, **Sobol' nets**, and **RQMC randomizations**. This dataset enables reproducible research and high-performance Quasi–Monte Carlo (QMC) and Randomized QMC (RQMC) simulation. -### Parameters for digital nets: `dnet` - -A *digital net in base* $b$ with $n=b^k$ points is defined by selecting integers -$s \geq 1$, $r \geq k \geq 1$, and $s$ matrices -$\boldsymbol{C}_1,\dots,\boldsymbol{C}_s$ of size $r \times k$ with entries in -$\mathbb{Z}_b$, called the generating matrices. For $i=0,\dots,n-1$, let $i = -\sum_{\ell=0}^{k-1} a_{i,\ell} b^\ell$ be the expansion of $i$ in base $b$, and -for $j=1,\dots s$, let - -$$(y_{i, j, 1}, \dots, y_{i, j, r})^T = \boldsymbol{C}_j \cdot (a_{i, 0}, \dots, a_{i, k-1})^T$$ - -and - -$$u_{i, j} = \sum_{\ell=1}^{r} y_{i, j, \ell} b^{-\ell}.$$ - -The points $\boldsymbol{u}_i$ are defined by $\boldsymbol{u}_i = -(u_{i,1},\dots,u_{i,s})$. Digital nets are usually in base $b=2$, but we allow a -general (typically prime) base $b \ge 2$. - -The proposed format to specify digital nets is as follows. The first line must -start with `# dnet`. Then the first four non-comment lines give $b$ (the base), -$s$ (the number of dimensions), $k$ (the number of columns), and $r$ (the number -of rows in the generating matrices in base $b$). Thus, the output values will -have "precision" $b^{-r}$ (they will be integer multiples of $b^{-r}$). 
For -$b=2$, a common value in the past has been $r=31$ when using 32 bit integers, -but going forward we should use 64 bit integers and $r=63$ or 64, or perhaps -$r=53$ to exploit the full accuracy of a `double`. By looking at $r$, one can -see right away whether this file is good for 64 bit computers only or for 32 bit -computers as well. - -The $s$ lines after this header will contain the $s$ generating matrices, one -per line. Each of these lines contains $k$ integers smaller than $b^r$ giving -the $k$ columns of $\boldsymbol{C}_j$, using by default the same encoding as in -the class `DigitalNetBase2` in SSJ for $b=2$. That is, the base $b$ -representation of the integer gives the $r$ digits in the corresponding column, -with the digit on the first row of the matrix (row 0) being the most -significant, and the one on the last row (row $r-1$) being the least -significant. For example, if $b=2$, $r=31$, and the first column has a 1 in the -first row and 0 in all other rows, as is always the case for Sobol points, then -the integer representation of this column will be $2^{30} = 1\,073\,741\,824$. -If there is a 1 in the last row and 0 elsewhere, the representation will be $2^0 -= 1$. If all 31 elements of the column are 1, the representation will be -$2^{31}-1$. - -One example of a file for a digital net in `dnet` format: +The [LDData repository](https://github.com/QMCSoftware/LDData) provides **standard text-based formats** for specifying structures used in QMC point generation across arbitrarily high dimensions. -``` -# dnet -# A digital net in base 2, in 'dnet' format -2 # basis b = 2 -8 # s = 8 dimensions -10 # k = 10, so n = 2^10 = 1024 points -31 # r = 31 digits -# The columns of gen. matrices C_1, ..., C_s, one matrix per line: -1073741824 536870912 268435456 134217728 ... -2012537125 1382645254 ... -... 
+--- + +## Dataset Summary + +LDData is a dataset of structured parameter files defining: + +- Rank-1 **lattice rules** +- Base-$b$ **digital nets** +- **Polynomial lattice rules** +- **Sobol' and Sobol–Joe-Kuo sequences** +- Various **randomizations** (shift modulo 1, digital shifts, nested uniform + scrambles, left matrix scrambles) + +Each file type follows a simple textual standard to ensure: +- Human readability +- Language-agnostic parsing +- Long-term reproducibility +- Extensibility to thousands of dimensions + +The dataset is motivated by the need for **standardized, compact, transparent formats** in [our QMC research and software](https://github.com/QMCSoftware). + +--- + +## Motivation + +Many QMC constructions appear across scattered software packages, papers, or custom formats. LDData brings these formats together into a **consistent, unified, machine-readable** repository for: + +- Researchers developing new QMC methods +- Practitioners needing high-dimensional low-discrepancy point sets +- Developers of simulation libraries such as SSJ, QMCPy, and LatNet Builder + +This dataset is linked to the research works described in the Citation section below. **For detailed technical specifications and implementation details**, see +[LD_DATA.md](LD_DATA.md) + +--- + +## Supported Tasks and Applications + +### ✔️ Quasi-Monte Carlo (QMC) +Generate deterministic point sets with excellent equidistribution. + +### ✔️ Randomized QMC (RQMC) +Use the included randomizations for variance estimation: +- Digital shifts +- Nested uniform scrambles +- Left-matrix scrambles + +### ✔️ High-dimensional Integration and Simulation + +Used in: +- Bayesian computation +- Option pricing +- High-dimensional PDE solvers +- Uncertainty quantification +- Graphics and rendering research +- Machine learning sampling methods + +### ✔️ Benchmarking +Standard formats help evaluate new constructions against established ones. 
+ +--- + +## Features + +- Simple `.txt` formats with **one line per dimension** +- Optional human-readable comments starting with `#` +- No binary encoding or word-size assumptions +- Supports extremely high dimensions (10,000+) +- Extensible constructions (e.g., Sobol or embedded nets) +- All formats interoperable with QMC software (SSJ, QMCPy, LatNet Builder) + +--- + +## How to Use the Dataset + +### Load files directly from Hugging Face + +```python +from datasets import load_dataset + +ds = load_dataset("QMCSoftware/LDData") ``` -This differs from Joe and Kuo (2008), where the $c$-th column (for $c\ge 1$) is -represented by an integer smaller than $2^c$ (in base 2) and the least -significant bit is the one on the diagonal. Their representation works when -$\boldsymbol{C}_j$ is upper triangular, which is true for Sobol point sets, but -not for digital nets in general. - -Recall that coordinate $j$ of the $i$-th point is obtained by multiplying the -base $b$ matrix $\boldsymbol{C}_j$ by the vector of digits of the representation -of $i$ in base $b$, with the least significant digits of $i$ at the top. That -is, the least significant digit of $i$ goes with the first column of -$\boldsymbol{C}_j$. And the first row of $\boldsymbol{C}_j$ is for the most -significant digit of output. With our representation of $\boldsymbol{C}_j$ by -$k$ integers, the points are easy and fast to generate in base 2. We obtain -`u[i,j]`, coordinate $j$ of point $i$, with the following code snippet, in which -`C[j,c]` is the integer that represents column $c$ of $\boldsymbol{C}_j$: +All data files are preserved in their directory structure and can be accessed +using: ```python -normFactor = 1.0 / (1 << r) # 2^(-r) -coord = 0 -for c in range(k): - coord ^= ((i >> c) & 1) * C[j,c] - u[i,j] = coord * normFactor +ds["train"] # or ds['default'] ``` +### Typical workflow + +1. Read a parameter file (e.g. `lattice_8d.txt`) +2. Parse header (`# lattice`, dimensions, n, etc.) +3. 
Parse one line per dimension for the generating vector or matrices +4. Construct QMC point generator in your preferred library + +--- + +## Dataset Structure -### Parameters for polynomial lattice rules: `plattice` - -*Polynomial lattice rules* are a special type of digital nets with generating -matrices of a special form. For a polynomial lattice rule of rank 1 in a prime -base $b$, we have - -$$P_n = \left\{\left(\varphi\left(\frac{h(z) a_1(z)}{Q(z)}\right), \dots, -\varphi\left(\frac{h(z) a_s(z)}{Q(z)}\right)\right) : h(z)\in\mathbb{F}_b[z], \ -\text{degree}(h(z)) < k\right\}.$$ - -where $\mathbb{F}_b[z]$ is the space of polynomials with coefficients in -$\mathbb{F}_b$, the *modulus* $Q(z) \in \mathbb{F}_b[z]$ is a polynomial of -degree $k$, the *generating vector* $\boldsymbol{a}(z) = -(a_1(z),\dots,a_s(z))\in \mathbb{F}_b[z]^s$ is a vector of $s$ polynomials of -degrees less than $k$, and the mapping $\varphi$ is defined by - -$$\varphi\left(\sum_{l = w}^{\infty} x_l z^{-l}\right) = \sum_{l = \max(w, -1)}^{\infty} x_l b^{-l}.$$ - -This point set has $n = b^k$ points. - -We must specify the polynomial modulus $Q(z)$ and the polynomial generating -vector $\boldsymbol{a}(z)$. The polynomial modulus will be represented as an -integer that has $(k+1)$ digits in base $b$, and all the other polynomials will -be represented as integers that have no more than $k$ digits in base $b$. All -these integers will be given in base 10 in the file, one per line. In practice, -we usually have $b=2$, so $k$ represents the number of bits. The integer that -represents a polynomial is obtained simply by replacing the formal variable by -$b$. For example, if the polynomial is $Q(z) = z^4 + z^3 + 1$ and $b=2$, its -coefficients are "1 1 0 0 1" and its integer representation is $2^4 + 2^3 + 1 = -25$. This is the usual representation, as used in Goda and Dick (2015), for -example. 
In the case of embedded point sets, the modulus should be $Q(z) = z^k$ -for $n=b^k$ points, and its integer representation is $b^k$. In particular, -$Q(z) = z$ is represented by the integer $b$. - -As usual, the first line is a comment that tells the type of file. Then the -first four non-comment lines give the base $b$, the number $s$ of dimensions, -the degree $k$ of the polynomial modulus, and the integer representation of this -polynomial. Lines 5 to $s+4$ give the polynomials that form the generating -vector, one per line, using the integer representation just explained. One -example of a file for a polynomial lattice in the `plattice` format: +The dataset includes multiple categories of files: + +### 🔹 `lattice` +Rank-1 lattice generating vectors: +- Header: `# lattice` +- Parameters: + - Number of dimensions `s` + - Number of points `n` + - `s` lines of generating vector coefficients + +--- + +### 🔹 `dnet` +General digital nets in base `b`: +- Header: `# dnet` +- Parameters: + - Base `b` + - Dimensions `s` + - Columns `k` + - Rows `r` +- Then `s` lines representing generating matrices + +Efficient for high-dimensional digital nets. + +--- + +### 🔹 `plattice` +Polynomial lattice rules: +- Compact format using integer-encoded polynomials +- Base `b`, dimension `s`, polynomial degree `k`, and generating polynomials + +--- + +### 🔹 `sobol` and `soboljk` +Parameters for Sobol' sequences: +- `soboljk`: Joe & Kuo format with primitive polynomials and direction numbers +- `sobol`: Simplified direction-number only format + +Used widely in QMC applications. + +--- + +### 🔹 Randomization formats + +Includes: + +- `shiftmod1`: Shift modulo 1 +- `dshift`: Digital shift in base `b` +- `nuscramble`: Nested uniform scramble +- `lmscramble`: Left matrix scramble + +All formats are text-based and reproducible. 
+ +--- + +## Example: Parsing a Lattice Rule File + +Example file: ``` -# plattice -# A polynomial lattice rule in base 2, in 'plattice' format -2 # base b = 2 -8 # s = 8 dimensions -16 # n = 2^16 = 65536 points -45781 # polynomial modulus -# coordinates of the generating vector, starting at j=1: -1 +# lattice +8 +65536 +1 +19463 17213 5895 14865 31925 30921 -26671 -17213 +26671 ``` -A polynomial lattice rule in base $b$ can also be represented as a digital net -in base $b$, so its parameters can also be provided in a file in the `dnet` -format, as for general digital net in base $b$. But the generating matrices have -a special form and the above representation is much more compact (a single -integer per row instead of $k$ integers per row). On the other hand, generating -the points is faster with the generating matrices than with the polynomial -representation, so the software that will use the `plattice` files and generate -the points would usually first convert the polynomials into the corresponding -generating matrices. LatNet Builder (L’Ecuyer et al., 2022) is also able to make -the conversion and produce a file in the `dnet` format, for more convenience and -better flexibility, so the user can select the format she/he prefers. - -### Parameters for Sobol nets: `sobol` and `soboljk` - -The Sobol' construction provides another special case of digital nets (and -sequences), in base 2. They are defined in many places, including Joe and Kuo -(2008). For each coordinate $j$, we select a primitive polynomial $p_j(z)$ of -degree $c_j$, and $c_j$ integers $m_{j,1},\dots,m_{j,c_j}$ which are used to -define the generating matrix $\boldsymbol{C}_j$. The real numbers $2^{-c} -m_{j,c}$ are called the initial *direction numbers*. More details are given in -Joe and Kuo (2008) and -[here](http://umontreal-simul.github.io/ssj/docs/master/classumontreal_1_1ssj_1_1hups_1_1SobolSequence.html). 
- -One obvious option for these point sets is to adopt exactly the same format as -Joe and Kuo (2008), because it is already used in many places. The only -difference is that we now allow comment lines in the file. In the format of Joe -and Kuo (2008), only the first line is skipped. In the proposed format, other -comment lines can be added at the beginning of the file, e.g., to give the -maximum number of dimensions in the file, the criterion and weights that were -used, etc. Note that Sobol' sequences have an infinite number of points and an -unlimited number of dimensions, although the file will give parameters for a -finite number of dimensions. - -The other lines of the file specify the primitive polynomials and the initial -direction numbers for each dimension $j \ge 2$, one line per dimension. For -dimension $j=1$, the generating matrix is the identity and is not given in the -file (it is implicit). The columns of this matrix are not obtained via a -recurrence based on a primitive polynomial, so this matrix is handled -separately. - -The first number on each line is the dimension $j$. The second number is the -degree $c_j$ of the primitive polynomial $p_j(x)$ used for this dimension. The -third number is the integer that corresponds to the binary representation of the -inner coefficients of this polynomial (we ignore the first and last -coefficients, they are always 1). For example, if the polynomial is $p_j(x) = -x^4 + x^3 + 1$, the coefficients are "1 1 0 0 1", and after removing the first -and last "1", we get 100 in base 2, which is 4, so the third column would -contain the number 4. (Without removing the first and last "1", the number would -be 25 instead.) After these three numbers, there are $c_j$ integers -$m_{j,1},\dots,m_{j,c_j}$ where $m_{j,c}$ is the $c^\text{th}$ (real-valued) -initial direction number for this coordinate, multiplied by $2^c$ to obtain an -integer. 
This $m_{j,c}$ is the integer formed by taking the bits in row 1 to row -$c$ of column $c$, in this order. The last bit is the bit on the diagonal, which -is always 1, so all $m_{j,c}$'s are odd integers. I think this format comes from -Bratley and Fox (1988). - -We denote this format for Sobol parameters by the `soboljk` keyword. One example -of a file in this format is shown below. The first line gives the type of file -and the next three lines are comments that must be skipped by the reading -program. +Python pseudo-code: -``` -# soboljk -# Parameters for Sobol points, in 'soboljk' format -# 8 dimensions -# c_j p_j m_{j,c} -2 1 0 1 -3 2 1 1 3 -4 3 1 1 3 1 -5 3 2 1 1 1 -6 4 1 1 1 3 3 -7 4 4 1 3 5 13 -8 5 2 1 1 5 5 17 +```python +with open("lattice_8d.txt") as f: + lines = [l for l in f.readlines() if not l.startswith("#")] + +s = int(lines[0]) +n = int(lines[1]) +a = [int(x) for x in lines[2:2+s]] ``` -The `soboljk` format can be simplified as follows. First, removing the first and -last "1" in the representation of the primitive polynomials saves a bit of -memory, but it also makes thinks slightly more complicated. In the default -representations of the primitive polynomials in the code that generates the -points, these bits are usually not removed. In SSJ, the first thing we do when -reading a file in `soboljk` format is to add them back. Also, the primitive -polynomials can be in a separate file, since they never change, and only the -(initial) direction numbers (those depend on the selected FOM and weights) would -be given to specify the Sobol' points. That is, we remove the first three -columns of the `soboljk` format. The Magic Point Shop (Nuyens, 2020) also -produces files that contain only the direction numbers. 
+--- -One example of a file in this `sobol` format: +## File Naming Recommendations -``` -# sobol -# Parameters m_{j,c} for Sobol points, in 'sobol' format -# 8 dimensions -1 # This is m_{j,c} for the second coordinate -1 3 -1 3 1 -1 1 1 -1 1 3 3 -1 3 5 13 -1 1 5 5 17 -``` +To support discoverability and consistent tooling: -A list of the first few primitive polynomials in base 2 is given -[here](https://mathworld.wolfram.com/PrimitivePolynomial.html). If we *do not* -remove the first and last 1's in their representations, the first primitive -polynomials are: `3, 7, 11, 13, 19, 25, ...`. Their degrees are `1, 2, 3, 3, 4, -4, ...`. This representation is the one used in the code of SSJ, for example. We -can have a separate file that gives these polynomials, one per line, exactly as -in the first three columns of the `soboljk` format. We may also want to remove -the first column. - -Another, perhaps more convenient, way of storing Sobol' constructions is to just -use the general `dnet` format, in which the generating matrices are given -explicitly. This `dnet` format is easier to use. On the other hand, it requires -specifying a (maximum) value of $k$, and $k$ integers per row to specify the -generating matrices, which leads to larger files. From a file in `sobol` format, -one can construct a digital net with an arbitrarily large $k$. - -When $n = 2^k$ is fixed, so we use exactly $n = 2^k$ points and there is there -no embedding, we can add one *extra dimension* at the beginning by using the -reflected identity as a generating matrix. The successive values for this -coordinate will then be $0, 1/n, 2/n, 3/n, \dots$ in this order. This matrix -will not be given in the file for Sobol' points; the QMC/RQMC software must -handle it. For lattice rules and general digital nets with fixed $n$ -(non-embedded), the file could give a first coordinate with this behavior. 
- -## Files that contain randomizations - -The idea of proposing a format for storing specific randomizations was suggested by Fred Hickernell. This can be useful for verification purposes, for example. - -We can store randomizations in the following file formats: - -- `shiftmod1` -A (random) shift modulo 1. It corresponds to a single point in $[0,1)^s$. -- `dshift` -A digital shift in base $b$. -Also a single point in $[0,1)^s$, but with $r$ digits in base $b$. -- `nuscramble` -A nested uniform scramble in base $b$. -- `lmscramble` -A (linear) left matrix scramble in base $b$. - - -For a `shiftmod1` in $s$ dimensions, the file will contain $s$ in the first -line, followed by $s$ real numbers between 0 and 1, one per line. +- All files begin with their keyword (`lattice_`, `dnet_`, `sobol_`, etc.) +- Headers contain: + - Construction method + - Figure of merit (FOM) + - Weights + - Embedded range (if applicable) +- Comments allowed in headers only -``` -# shiftmod1 -# A shift modulo 1, in 'shiftmod1' format -3 # s = 3 dimensions -0.32638741823951621 -0.91325392536931693 -0.1530364040t106301 -``` +--- -For a `dshift` with $r$ digits of accuracy in base $b$, in $s$ dimensions, the -file will contain $b$ in the first line, $s$ in the second line, $r$ in the -third line, and then $s$ integers from 0 to $b^r-1$, one per line. For the -latter, the digits of the base $b$ representation of the integer divided by -$b^r$ will be added modulo $b$ to the corresponding digits of the base $b$ -representation of the coordinate. For example, if $b=2$ and $r=31$, the -randomization makes a xor of the 31 bits of this integer with the 31 most -significant bits of the corresponding coordinate of each point. 
+## References -``` -# dshift -# A digital shift in base 2, in 'dshift' format -2 # b = 2 -3 # s = 3 -31 # r = 31 -2146832861 -1084390381 -963462828 -``` +This dataset incorporates formats and ideas from foundational work in QMC: -For a `lmscramble` with $r$ digits of accuracy, for $b^k$ points in base $b$ in -$s$ dimensions, we need to store $s$ lower-triangular invertible $r\times r$ -matrices with entries in $\{0,\dots,b-1\}$. For $b=2$, each matrix must have -only 1's on the diagonal and 0's above the diagonal. Each such matrix can be -stored in one line of the file, in exactly the same format as the generating -matrices in the `dnet` format, using one integer for each column. We want them -in this format for the fast LMS implementation we have in SSJ, for example. The -file will contain $b$ in the first non-comment line, $s$ in the second line, $r$ -in the third line, and then $s$ square lower-triangular and invertible $r\times -r$ matrices, one per line, with each column represented as an integer as in the -`dnet` format. Thus, each scrambling matrix is represented by $r$ integers on -the same line. Here is an example, +- Bratley & Fox (1988) +- Joe & Kuo (2008) +- L’Ecuyer (2016) +- Goda & Dick (2015) +- Nuyens (2020) +- And others listed in the detailed specification [LD_DATA.md](LD_DATA.md). -``` -# lmscramble -# A left matrix scramble in base 2, with 31 digits of resolution. -2 # basis b = 2 -8 # s = 8 dimensions -31 # r = 31 digits -# The columns of the lower-triangular r x r scrambling matrices, one matrix per line: -1673741824 906870912 615843556 213427728 ... -2012537125 1012645254 ... -... -``` +--- + +## Citation -For a `nuscramble` of the first $r \ge k$ digits, for $n=b^k$ points in base $b$ -in $s$ dimensions, with the implementation proposed in Section 3 of Friedel and -Keller (2002) and used for $b=2$ in class `DigitalNetBase2` of SSJ, we need $sn$ -blocks of $r$ random digits in base $b$. 
Each such block can be represented as -an integer in the range $\{0,1,\dots,b^r-1\}$. For $b=2$, these are $r$ bit -integers. We can store these integers one row per dimension, $n$ integers per -row. This gives the following `nuscramble` file format. The first non-comment -line contains the base $b$, the second line gives the number $s$ of dimensions, -the third line gives the scramble resolution (the number of digits that are -scrambled), and the following $s$ lines give the $sn$ integers used for the -scrambling, $n$ integers per line. Note that this is the same amount of random -numbers that we would need if we use plain Monte Carlo instead of RQMC[^4][^5]. - -[^4]: Another way of storing the NUS is as follows. For each coordinate $j$, - each point can be identified by a $k$ bit integer, and the NUS maps each - such $k$ to a $r$ bit integer that corresponds to the scrambled coordinate - $j$ of this point. So we can simply store this map in an array of size $b^k$ - whose entry $i$ contains the corresponding $r$ bit integer. Applying this - NUS is then fast and straightforward. - -[^5]: Alternative implementations of NUS that use a hashing function in place of - a RNG are proposed in Burley (2020) and Laine and Karras (2011). These - methods might be faster and the is much less information to store to - reproduce a given scramble, but the hashing function must be fixed, known, - and reliable. This essentially amount to fixing the RNG and storing only its - seed. +If you use LDData in academic work, please cite: ``` -# nuscramble -# A nested uniform scramble in base 2, with 30 bits of resolution. -2 # basis b = 2 -8 # s = 8 dimensions -10 # k = 10, so n = 2^10 = 1024 points -30 # r = 30 digits -# The following s rows contain n = 1024 30 bit integers per row: -1173741824 906870912 615843556 213427728 ... -1012537125 1001975254 ... -... 
+@article{sorokin.2025.ld_randomizations_ho_nets_fast_kernel_mats, + title = {{QMCPy}: a {P}ython software for randomized low-discrepancy sequences, quasi-{M}onte {C}arlo, and fast kernel methods}, + author = {Aleksei G. Sorokin}, + year = {2025}, + journal = {ArXiv preprint}, + volume = {abs/2502.14256}, + url = {https://arxiv.org/abs/2502.14256}, +} + + +@inproceedings{choi.QMC_software, + title = {Quasi-{M}onte {C}arlo software}, + author = {Choi, Sou-Cheng T. and Hickernell, Fred J. and Rathinavel, Jagadeeswaran and McCourt, Michael J. and Sorokin, Aleksei G.}, + year = {2022}, + booktitle = {{M}onte {C}arlo and Quasi-{M}onte {C}arlo Methods 2020}, + publisher = {Springer International Publishing}, + address = {Cham}, + pages = {23--47}, + isbn = {978-3-030-98319-2}, + editor = {Keller, Alexander}, +} ``` -## File names and other recommendations - -It is strongly recommend that all file names start with the corresponding -keyword, like `plattice` for a polynomial lattice rule, `sobol` for a Sobol -point set, and `lmscramble` for a left matrix scramble, for example. - -It is also recommended to put enough relevant comments in each file for a -knowledgeable human to find what the file is for (type of point set, figure of -merit and weights that were used to construct it, range of values of $n$ for -embedded point sets, etc.). - - -We also want some unit tests: some specific parameter files together with the -correct output that should be observed when generating the points from these -files. - -## References - -- P. Bratley and B. L. Fox. Algorithm 659: Implementing Sobol’s quasirandom - sequence generator. ACM Transactions on Mathematical Software, 14(1):88–100, - 1988. -- Brent Burley. Practical hash-based Owen scrambling. The Journal of Computer - Graphics Techniques, 9(4):1–20, 2020. -- I. Friedel and A. Keller. Fast generation of randomized low-discrepancy point - sets. In K.-T. Fang, F. J. Hickernell, and H. 
Niederreiter, editors, Monte - Carlo and Quasi-Monte Carlo Methods 2000, pages 257–273, Berlin, 2002. - Springer-Verlag. -- T. Goda and J. Dick. Construction of interlaced scrambled polynomial lattice - rules of arbitrary high order. Foundation of Computational Mathematics, - 15:1245–1278, 2015. -- S. Joe and F. Y. Kuo. Constructing Sobol sequences with better two-dimensional - projections. SIAM Journal on Scientific Computing, 30(5):2635–2654, 2008. -- S. Laine and T. Karras. Stratified sampling for stochastic transparency. - Computer Graphics Forum, 30(4):1197–1204, 2011. -- P. L’Ecuyer. SSJ: Stochastic simulation in Java. - http://simul.iro.umontreal.ca/ssj/, accessed 9th August 2021, 2016. -- P. L’Ecuyer, P. Marion, M. Godin, and F. Puchhammer. A tool for custom - construction of QMC and RQMC point sets. In A. Keller, editor, Monte Carlo and - Quasi-Monte Carlo Methods: MCQMC 2020, pages 51–70, Berlin, 2022. Springer. - https://arxiv.org/abs/2012.10263. -- D. Nuyens. The magic point shop, 2020. - https://people.cs.kuleuven.be/~dirk.nuyens/qmc-generators/. +--- + +## License + +Apache 2 License. +See [`LICENSE`](LICENSE.txt) file for details. + +--- + +## Acknowledgements + +This dataset is developed and maintained by: + +- **QMCSoftware team** +- Contributors to QMCPy, SSJ, and LatNet Builder +- Community contributions from QMC & RQMC researchers + +Special thanks to researchers providing widely used generating vectors and direction numbers used throughout the scientific computing community. diff --git a/upload.py b/upload.py new file mode 100644 index 0000000..e5b8c26 --- /dev/null +++ b/upload.py @@ -0,0 +1,204 @@ +#!/usr/bin/env python +""" +upload.py + +Upload the QMCSoftware/LDData repository (low discrepancy generating vectors +and matrices) to the Hugging Face Datasets Hub as a dataset repo. + +This script: + +1. Creates (or reuses) a dataset repo on the Hub. +2. Uploads all files from a local LDData checkout using `upload_folder`. +3. 
Leaves README.md in place so it becomes the dataset card. + After upload you can edit the card in the web UI to: + - Link to the paper (arXiv:2502.14256). + - Add "Citation" and "Uses" sections, similar to + - facebook/omnilingual-asr-corpus + - nvidia/PhysicalAI-Autonomous-Vehicles + - moondream/refcoco-m + +Requirements: + pip install "huggingface_hub>=0.32.0" + +Authentication: + - Either set HF_TOKEN in your environment: + export HF_TOKEN=hf_xxx... + OR pass --token on the command line. + +Example usage: + python upload.py \ + --repo-id QMCSoftware/LDData \ + --local-path /path/to/local/LDData + +After upload you’ll be able to do, e.g.: + + from datasets import load_dataset + ds = load_dataset("QMCSoftware/LDData") + +and link the dataset to your paper page on Hugging Face. +""" + +import argparse +import os +import sys +from pathlib import Path +import httpx + +from huggingface_hub import HfApi, create_repo # type: ignore + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Upload LDData to the Hugging Face Datasets Hub." + ) + parser.add_argument( + "--repo-id", + type=str, + default="QMCSoftware/LDData", + help="Target dataset repo id on the Hub (e.g. 'QMCSoftware/LDData').", + ) + parser.add_argument( + "--local-path", + type=str, + default=".", + help="Path to local LDData checkout (default: current directory).", + ) + parser.add_argument( + "--token", + type=str, + default=None, + help="Hugging Face access token. If omitted, HF_TOKEN env var is used.", + ) + parser.add_argument( + "--private", + action="store_true", + help="Create the dataset repo as private (default: public).", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Do not upload, just print what would be done.", + ) + return parser.parse_args() + + +def get_token(cmd_token: str | None) -> str: + token = cmd_token or os.environ.get("HF_TOKEN") + if not token: + raise SystemExit( + "No token provided. 
Please either:\n" + " - set HF_TOKEN in your environment, or\n" + " - pass --token hf_xxx... on the command line." + ) + return token + + +def main() -> None: + args = parse_args() + + local_path = Path(args.local_path).expanduser().resolve() + if not local_path.exists(): + raise SystemExit(f"Local path does not exist: {local_path}") + + # Sanity check: are we in LDData? + readme = local_path / "README.md" + if not readme.exists(): + print( + f"WARNING: {readme} does not exist. " + "Are you sure this is the LDData repo root?", + file=sys.stderr, + ) + + token = get_token(args.token) + repo_id = args.repo_id + + print(f"Using repo_id: {repo_id}") + print(f"Local path : {local_path}") + print(f"Private : {args.private}") + if args.dry_run: + print("Dry run enabled: NOT creating or uploading, just showing intent.") + return + + # Initialize API client + api = HfApi(token=token) + + # 1. Create (or reuse) the dataset repo on the Hub + print(f"Creating (or reusing) dataset repo '{repo_id}' on the Hub...") + create_repo( + repo_id=repo_id, + repo_type="dataset", + private=args.private, + exist_ok=True, + token=token, + ) + + # 2. Upload folder contents + # We ignore some typical non-data files to keep the repo clean. + # Adjust this list if you want to exclude more or fewer things. + ignore_patterns = [ + ".git/*", + ".gitignore", + ".DS_Store", + "__pycache__/*", + "*.pyc", + "*.pyo", + "*~", + "*.ipynb_checkpoints*", + "sc", + "raw.githubusercontent.com" + # If you do NOT want to upload the demo notebook or env file, keep these: + # "LDData Demo.ipynb", + # "env.yml", + ] + + print("Uploading local folder to the Hub (this may take a while)...") + + # NOTE: + # Older/newer versions of huggingface_hub's HfApi.upload_folder do not accept + # a `timeout` keyword argument. 
Passing it raises: + # TypeError: HfApi.upload_folder() got an unexpected keyword argument 'timeout' + # To remain compatible, call upload_folder without a `timeout` kwarg and + # handle httpx.ReadTimeout explicitly. + try: + api.upload_folder( + folder_path=str(local_path), + repo_id=repo_id, + repo_type="dataset", + ignore_patterns=ignore_patterns, + ) + except httpx.ReadTimeout: + print() + print("ERROR: Upload read timed out.") + print("Possible actions:") + print(" - Check your network connection and try again.") + print(" - Try uploading in smaller batches (split large files or directories).") + print(" - Upgrade huggingface_hub to the latest version in case it adds improved timeout handling.") + print(" - If you have very large files, consider using git-lfs or the web UI.") + sys.exit(1) + except Exception as exc: + print() + print("ERROR: Upload failed with an unexpected error:") + print(f" {exc!r}") + sys.exit(1) + + dataset_url = f"https://huggingface.co/datasets/{repo_id}" + print() + print("✅ Upload complete.") + print(f"Dataset is now available at: {dataset_url}") + print() + print("Next steps (recommended):") + print(" 1. Open the dataset page above in your browser.") + print(" 2. Edit the Dataset Card (README.md) to:") + print(" - Add paper links (e.g., your QMCSoftware/LDData arXiv paper).") + print(" - Add a 'Citations' section.") + print(" - Add 'Uses' and 'Limitations' sections, similar to:") + print(" - facebook/omnilingual-asr-corpus") + print(" - nvidia/PhysicalAI-Autonomous-Vehicles") + print(" - moondream/refcoco-m") + print(" 3. 
Use 'Paper' / 'Dataset' linking in the Hugging Face UI to") + print(" attach the dataset to your paper so it shows up on the") + print(" paper page and in discovery views.") + + +if __name__ == "__main__": + main() \ No newline at end of file From d6b113030c87e5983913934719f2f2dc77807be2 Mon Sep 17 00:00:00 2001 From: sou-cheng-choi Date: Thu, 20 Nov 2025 09:16:33 -0600 Subject: [PATCH 07/15] +reset option --- .gitignore | 1 + upload.py | 122 ++++++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 113 insertions(+), 10 deletions(-) diff --git a/.gitignore b/.gitignore index 5bdb815..57816f9 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ raw.githubusercontent.com/* *.ipynb-checkpoints /sc .vscode/settings.json +*.pyc diff --git a/upload.py b/upload.py index e5b8c26..b47a941 100644 --- a/upload.py +++ b/upload.py @@ -43,6 +43,7 @@ import sys from pathlib import Path import httpx +import fnmatch from huggingface_hub import HfApi, create_repo # type: ignore @@ -79,6 +80,16 @@ def parse_args() -> argparse.Namespace: action="store_true", help="Do not upload, just print what would be done.", ) + parser.add_argument( + "--reset-remote", + action="store_true", + help="Delete the remote dataset repo on the Hub before uploading (destructive).", + ) + parser.add_argument( + "--yes", + action="store_true", + help="Answer yes to any interactive confirmation prompts (use with care).", + ) return parser.parse_args() @@ -136,6 +147,7 @@ def main() -> None: # We ignore some typical non-data files to keep the repo clean. # Adjust this list if you want to exclude more or fewer things. 
ignore_patterns = [ + "sc/*", ".git/*", ".gitignore", ".DS_Store", @@ -144,7 +156,6 @@ def main() -> None: "*.pyo", "*~", "*.ipynb_checkpoints*", - "sc", "raw.githubusercontent.com" # If you do NOT want to upload the demo notebook or env file, keep these: # "LDData Demo.ipynb", @@ -159,27 +170,118 @@ def main() -> None: # TypeError: HfApi.upload_folder() got an unexpected keyword argument 'timeout' # To remain compatible, call upload_folder without a `timeout` kwarg and # handle httpx.ReadTimeout explicitly. + + # Build list of candidate files and compute total size (exclude ignored) + def is_ignored(rel_path: str) -> bool: + # fnmatch against each ignore pattern; use posix style paths + for patt in ignore_patterns: + if fnmatch.fnmatch(rel_path, patt): + return True + if fnmatch.fnmatch("/" + rel_path, patt): + return True + return False + + files_to_upload = [] + total_size = 0 + for root, _, files in os.walk(local_path): + for fname in files: + full = Path(root) / fname + # Robust relative path computation: use os.path.relpath to avoid Path.relative_to errors + rel = os.path.relpath(str(full), start=str(local_path)) + # Normalize to posix separators and strip any leading './' or '/' + rel = rel.replace(os.sep, "/") + if rel.startswith("./"): + rel = rel[2:] + if rel.startswith("/"): + rel = rel.lstrip("/") + if is_ignored(rel): + continue + try: + sz = full.stat().st_size + except OSError: + sz = 0 + files_to_upload.append((full, rel)) + total_size += sz + + # Threshold to prefer upload_large_folder (50 MiB) + LARGE_THRESHOLD = 50 * 1024 * 1024 + try: - api.upload_folder( - folder_path=str(local_path), - repo_id=repo_id, - repo_type="dataset", - ignore_patterns=ignore_patterns, - ) + if total_size > LARGE_THRESHOLD and hasattr(api, "upload_large_folder"): + # Prefer API method designed for large folders when available. 
+ try: + api.upload_large_folder( + folder_path=str(local_path), + repo_id=repo_id, + repo_type="dataset", + ignore_patterns=ignore_patterns, + token=token, + ) + except TypeError: + try: + api.upload_large_folder( + folder_path=str(local_path), + repo_id=repo_id, + repo_type="dataset", + token=token, + ) + except Exception as exc: + print() + print("ERROR: upload_large_folder failed:") + print(f" {exc!r}") + print("Falling back to per-file upload...") + raise + else: + api.upload_folder( + folder_path=str(local_path), + repo_id=repo_id, + repo_type="dataset", + ignore_patterns=ignore_patterns, + ) except httpx.ReadTimeout: print() print("ERROR: Upload read timed out.") print("Possible actions:") print(" - Check your network connection and try again.") print(" - Try uploading in smaller batches (split large files or directories).") + print(" - Use `HfApi().upload_large_folder(...)` or the CLI `hf upload-large-folder` if available.") print(" - Upgrade huggingface_hub to the latest version in case it adds improved timeout handling.") print(" - If you have very large files, consider using git-lfs or the web UI.") sys.exit(1) - except Exception as exc: + except TypeError as exc: print() - print("ERROR: Upload failed with an unexpected error:") + print("ERROR: Upload function raised a TypeError (likely a signature mismatch):") print(f" {exc!r}") - sys.exit(1) + print("Falling back to per-file upload...") + except Exception: + # If upload_large_folder/upload_folder raised but we want to fallback to per-file, continue below. + pass + + # Per-file upload fallback: ensure we use the normalized relative paths so subfolders are preserved. 
+ if files_to_upload: + print("Falling back to per-file upload (this is slower but more robust for flaky networks)...") + for file_path, rel in files_to_upload: + # Ensure rel is relative and posix-normalized (it already is from above) + path_in_repo = rel.lstrip("/") # safety + success = False + try: + api.upload_file( + path_or_fileobj=str(file_path), + path_in_repo=path_in_repo, + repo_id=repo_id, + repo_type="dataset", + token=token, + ) + success = True + except httpx.ReadTimeout: + print() + print("ERROR: Per-file upload read timed out on:", path_in_repo) + print("You can retry this script or use `hf upload-large-folder` / `HfApi.upload_large_folder`.") + sys.exit(1) + except Exception as exc: + print(f"WARNING: Failed to upload {path_in_repo!s}: {exc!r}") + if not success: + print(f"Failed to upload: {path_in_repo}") dataset_url = f"https://huggingface.co/datasets/{repo_id}" print() From 7f79bc93923a2074f7c6dddf6747d7af3ebda6fe Mon Sep 17 00:00:00 2001 From: sou-cheng-choi Date: Thu, 20 Nov 2025 09:30:57 -0600 Subject: [PATCH 08/15] Add workflow --- .github/workflows/sync-to-huggingface.yml | 58 ++++++ scripts/git_lfs_upload.sh | 220 ++++++++++++++++++++++ upload.py | 128 +++++++++++-- 3 files changed, 388 insertions(+), 18 deletions(-) create mode 100644 .github/workflows/sync-to-huggingface.yml create mode 100644 scripts/git_lfs_upload.sh diff --git a/.github/workflows/sync-to-huggingface.yml b/.github/workflows/sync-to-huggingface.yml new file mode 100644 index 0000000..e4cc752 --- /dev/null +++ b/.github/workflows/sync-to-huggingface.yml @@ -0,0 +1,58 @@ +name: Sync to HuggingFace Dataset + +on: + push: + branches: + - main + - hug + paths: + - 'pregenerated_pointsets/**' + - 'dnet/**' + - 'lattice/**' + - LICENSE.txt + - LDData Demo.ipynb + - LD_DATA.md + - README.md + workflow_dispatch: # Allow manual triggering + +jobs: + sync-to-hf: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + ref: main + 
fetch-depth: 1 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install "huggingface_hub>=0.32.0" httpx + + - name: Upload to HuggingFace + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + run: | + python upload.py \ + --repo-id QMCSoftware/LDData \ + --local-path . \ + --yes + + - name: Report status + if: success() + run: | + echo "✅ Successfully synchronized pregenerated_pointsets to HuggingFace dataset" + echo "Dataset URL: https://huggingface.co/datasets/QMCSoftware/LDData" + + - name: Report failure + if: failure() + run: | + echo "❌ Failed to synchronize to HuggingFace" + echo "Check the logs above for details" diff --git a/scripts/git_lfs_upload.sh b/scripts/git_lfs_upload.sh new file mode 100644 index 0000000..9e1acfd --- /dev/null +++ b/scripts/git_lfs_upload.sh @@ -0,0 +1,220 @@ +#!/usr/bin/env bash +# git_lfs_upload.sh +# +# Automate uploading folders/files to a Hugging Face dataset repo using git + git-lfs. +# This preserves folder structure, avoids API rate limits, and handles large files. +# +# Usage: +# 1) Make executable: +# chmod +x scripts/git_lfs_upload.sh +# 2) Run: +# ./scripts/git_lfs_upload.sh \ +# --repo-id QMCSoftware/LDData \ +# --local-path /Users/terrya/Documents/ProgramData/LDData \ +# --folders dnet,lattice,pregenerated_pointsets,README.md \ +# --branch main +# +# Environment notes: +# - If the repo is private, set HF_TOKEN in your environment or pass --token. +# If HF_TOKEN is present, the script will embed it into the clone URL for non-interactive auth. +# - Install prerequisites: git, git-lfs, rsync (macOS: `brew install git-lfs rsync`). +# - The script commits and pushes each folder separately to reduce the size of each push. + +set -euo pipefail +IFS=$'\n\t' + +REPO_ID="" +LOCAL_PATH="." 
+FOLDERS="" +BRANCH="main" +HF_TOKEN="${HF_TOKEN:-}" +CLONE_DIR="hf_repo" +EXCLUDES=() +LFS_PATTERNS=("*.bin" "*.zip" "*.tar" "*.tgz" "*.h5" "*.npy" "*.npz" "*.ckpt" "*.pt" "*.pth" "*.gz") + +print_usage() { + cat < Hugging Face dataset repo id (required) + --local-path Local LDData root (default: .) + --folders Comma-separated folders/files to upload (e.g. dnet,lattice,README.md) + --branch Git branch to push to (default: main) + --token Hugging Face token (or set HF_TOKEN env var) + --clone-dir Directory to clone the repo into (default: hf_repo) + --exclude Add an rsync exclude pattern (can be supplied multiple times) + -h, --help Show this help and exit + +Example: + $0 --repo-id QMCSoftware/LDData --local-path . --folders dnet,lattice,README.md + +This script: + - clones the target dataset repo, + - copies the requested folders/files into the clone (preserving structure), + - enables git-lfs for common large file types, + - commits and pushes each folder/file separately to reduce push sizes. + +EOF +} + +# Simple arg parsing +while [[ $# -gt 0 ]]; do + case "$1" in + --repo-id) + REPO_ID="$2"; shift 2;; + --local-path) + LOCAL_PATH="$2"; shift 2;; + --folders) + FOLDERS="$2"; shift 2;; + --branch) + BRANCH="$2"; shift 2;; + --token) + HF_TOKEN="$2"; shift 2;; + --clone-dir) + CLONE_DIR="$2"; shift 2;; + --exclude) + EXCLUDES+=("$2"); shift 2;; + -h|--help) + print_usage; exit 0;; + *) + echo "Unknown arg: $1" >&2; print_usage; exit 2;; + esac +done + +if [[ -z "$REPO_ID" ]]; then + echo "--repo-id is required" >&2 + print_usage + exit 2 +fi + +# Normalize LOCAL_PATH +LOCAL_PATH=$(cd "$LOCAL_PATH" && pwd) + +echo "Repo ID: $REPO_ID" +echo "Local path: $LOCAL_PATH" +echo "Folders: $FOLDERS" +echo "Branch: $BRANCH" + +# Check dependencies +command -v git >/dev/null 2>&1 || { echo "git not found; install git." >&2; exit 1; } +command -v rsync >/dev/null 2>&1 || { echo "rsync not found; install rsync." >&2; exit 1; } +if ! 
command -v git-lfs >/dev/null 2>&1; then + echo "git-lfs not found; installing is recommended. Please install git-lfs and run 'git lfs install'." >&2 + echo "On macOS: brew install git-lfs && git lfs install" >&2 + read -p "Continue without git-lfs? [y/N]: " c + if [[ "$c" != "y" && "$c" != "Y" ]]; then + exit 1 + fi +fi + +# Prepare clone URL +if [[ -n "$HF_TOKEN" ]]; then + # Embed token for non-interactive auth (note: exposing token in process list may be a security risk) + CLONE_URL="https://${HF_TOKEN}@huggingface.co/datasets/${REPO_ID}.git" +else + CLONE_URL="https://huggingface.co/datasets/${REPO_ID}.git" +fi + +# Clone the repo +if [[ -d "$CLONE_DIR" ]]; then + echo "Removing existing clone dir $CLONE_DIR" + rm -rf "$CLONE_DIR" +fi + +echo "Cloning ${CLONE_URL} -> ${CLONE_DIR}" +if ! git clone --depth 1 --branch "$BRANCH" "$CLONE_URL" "$CLONE_DIR"; then + echo "Initial clone failed; trying full clone (no depth)" + git clone --branch "$BRANCH" "$CLONE_URL" "$CLONE_DIR" +fi + +pushd "$CLONE_DIR" >/dev/null + +# Configure git user if not set +if ! git config user.email >/dev/null; then + git config user.email "uploader@example.com" +fi +if ! 
git config user.name >/dev/null; then + git config user.name "LDData uploader" +fi + +# Ensure branch exists locally +git checkout -B "$BRANCH" + +# Set up git-lfs patterns (only add if git-lfs available) +if command -v git-lfs >/dev/null 2>&1; then + echo "Configuring git-lfs patterns: ${LFS_PATTERNS[*]}" + for pat in "${LFS_PATTERNS[@]}"; do + git lfs track --no-update "$pat" || true + done + # Ensure .gitattributes is added + git add .gitattributes || true + git commit -m "Add git-lfs tracking patterns" --allow-empty || true +fi + +# Helper to build rsync exclude args +RSYNC_EXCLUDE_ARGS=() +for ex in "${EXCLUDES[@]}"; do + RSYNC_EXCLUDE_ARGS+=(--exclude "$ex") +done + +# Copy function: copy a single folder or file into the clone preserving path +copy_item() { + local item="$1" + echo "Processing: $item" + if [[ -d "$LOCAL_PATH/$item" ]]; then + mkdir -p "$(dirname "$item")" + rsync -av --delete "${RSYNC_EXCLUDE_ARGS[@]}" "$LOCAL_PATH/$item" ./ + elif [[ -f "$LOCAL_PATH/$item" ]]; then + mkdir -p "$(dirname "$item")" + rsync -av "${RSYNC_EXCLUDE_ARGS[@]}" "$LOCAL_PATH/$item" ./ + else + echo "Warning: $item not found in $LOCAL_PATH; skipping" + fi +} + +# Commit & push a path (folder or file) +commit_and_push() { + local path="$1" + git add --all "$path" || true + if git diff --staged --quiet; then + echo "No changes staged for $path" + return + fi + git commit -m "Upload $path" || true + echo "Pushing $path to origin/$BRANCH" + git push origin "$BRANCH" +} + +# If folders list is empty, upload whole workspace excluding excludes +if [[ -z "$FOLDERS" ]]; then + echo "No --folders provided; copying whole local tree (respecting excludes)." + rsync -av --delete "${RSYNC_EXCLUDE_ARGS[@]}" "$LOCAL_PATH/" ./ + commit_and_push "." 
+else + # iterate comma-separated list + IFS=',' read -ra ITEMS <<< "$FOLDERS" + for it in "${ITEMS[@]}"; do + it_trimmed=$(echo "$it" | sed 's/^\s*//;s/\s*$//') + if [[ -z "$it_trimmed" ]]; then + continue + fi + copy_item "$it_trimmed" + commit_and_push "$it_trimmed" + # pause between folder pushes to avoid network bursts + sleep 3 + done +fi + +# Final push of any remaining changes +git add --all || true +if ! git diff --staged --quiet; then + git commit -m "Upload remaining files" || true + git push origin "$BRANCH" +fi + +popd >/dev/null + +echo "Upload complete. Repository at: https://huggingface.co/datasets/${REPO_ID}" + +# End of script diff --git a/upload.py b/upload.py index b47a941..a332ae9 100644 --- a/upload.py +++ b/upload.py @@ -41,11 +41,14 @@ import argparse import os import sys +import time +import random from pathlib import Path import httpx import fnmatch from huggingface_hub import HfApi, create_repo # type: ignore +from huggingface_hub.errors import HfHubHTTPError # type: ignore def parse_args() -> argparse.Namespace: @@ -104,6 +107,77 @@ def get_token(cmd_token: str | None) -> str: return token +def retry_call( + fn, + *args, + retries: int = 6, + base_delay: float = 1.0, + max_delay: float = 60.0, + allowed_status_for_retry=(429, 500, 502, 503, 504), + **kwargs, +): + """ + Call `fn(*args, **kwargs)` with retries on rate limit (429), server errors and read timeouts. + Uses exponential backoff with jitter. If the server provides a Retry-After header it will be honored. 
+ """ + last_exc = None + for attempt in range(retries): + try: + return fn(*args, **kwargs) + except HfHubHTTPError as exc: + last_exc = exc + # Try to extract status code and headers robustly + status_code = getattr(exc, "status_code", None) + headers = {} + resp = getattr(exc, "response", None) + if resp is not None: + try: + status_code = getattr(resp, "status_code", status_code) + headers = getattr(resp, "headers", {}) or {} + except Exception: + pass + if status_code in allowed_status_for_retry: + # Honor Retry-After if present + retry_after = None + for key in ("retry-after", "Retry-After"): + if key in headers: + retry_after = headers.get(key) + break + if retry_after is not None: + try: + delay = int(retry_after) + except Exception: + try: + delay = float(retry_after) + except Exception: + delay = None + else: + delay = min(max_delay, base_delay * (2 ** attempt)) + delay += random.uniform(0, base_delay) + # On last attempt, break and raise + if attempt == retries - 1: + break + time.sleep(delay) + continue + # Non-retryable HTTP error + raise + except httpx.ReadTimeout as exc: + last_exc = exc + if attempt == retries - 1: + break + delay = min(max_delay, base_delay * (2 ** attempt)) + random.uniform(0, base_delay) + time.sleep(delay) + continue + except Exception as exc: + # For other exceptions, do not retry except maybe transient httpx.NetworkError -- keep simple and re-raise + last_exc = exc + raise + # If we exit loop without returning, raise last exception + if last_exc is not None: + raise last_exc + raise RuntimeError("retry_call failed without exception") + + def main() -> None: args = parse_args() @@ -135,7 +209,8 @@ def main() -> None: # 1. 
Create (or reuse) the dataset repo on the Hub print(f"Creating (or reusing) dataset repo '{repo_id}' on the Hub...") - create_repo( + retry_call( + create_repo, repo_id=repo_id, repo_type="dataset", private=args.private, @@ -210,7 +285,8 @@ def is_ignored(rel_path: str) -> bool: if total_size > LARGE_THRESHOLD and hasattr(api, "upload_large_folder"): # Prefer API method designed for large folders when available. try: - api.upload_large_folder( + retry_call( + getattr(api, "upload_large_folder"), folder_path=str(local_path), repo_id=repo_id, repo_type="dataset", @@ -218,21 +294,23 @@ def is_ignored(rel_path: str) -> bool: token=token, ) except TypeError: - try: - api.upload_large_folder( - folder_path=str(local_path), - repo_id=repo_id, - repo_type="dataset", - token=token, - ) - except Exception as exc: - print() - print("ERROR: upload_large_folder failed:") - print(f" {exc!r}") - print("Falling back to per-file upload...") - raise + # Signature mismatch: try without ignore_patterns + retry_call( + getattr(api, "upload_large_folder"), + folder_path=str(local_path), + repo_id=repo_id, + repo_type="dataset", + token=token, + ) + except Exception as exc: + print() + print("ERROR: upload_large_folder failed:") + print(f" {exc!r}") + print("Falling back to per-file upload...") + # fall through to per-file upload else: - api.upload_folder( + retry_call( + getattr(api, "upload_folder"), folder_path=str(local_path), repo_id=repo_id, repo_type="dataset", @@ -253,6 +331,12 @@ def is_ignored(rel_path: str) -> bool: print("ERROR: Upload function raised a TypeError (likely a signature mismatch):") print(f" {exc!r}") print("Falling back to per-file upload...") + except HfHubHTTPError as exc: + print() + print("ERROR: HF Hub returned an HTTP error while attempting to upload:") + print(f" {exc!r}") + print("If this is a rate limit (429) you may need to wait, reduce request rate, or upgrade your plan.") + sys.exit(1) except Exception: # If upload_large_folder/upload_folder raised 
but we want to fallback to per-file, continue below. pass @@ -260,12 +344,13 @@ def is_ignored(rel_path: str) -> bool: # Per-file upload fallback: ensure we use the normalized relative paths so subfolders are preserved. if files_to_upload: print("Falling back to per-file upload (this is slower but more robust for flaky networks)...") - for file_path, rel in files_to_upload: + for idx, (file_path, rel) in enumerate(files_to_upload): # Ensure rel is relative and posix-normalized (it already is from above) path_in_repo = rel.lstrip("/") # safety success = False try: - api.upload_file( + retry_call( + getattr(api, "upload_file"), path_or_fileobj=str(file_path), path_in_repo=path_in_repo, repo_id=repo_id, @@ -278,10 +363,17 @@ def is_ignored(rel_path: str) -> bool: print("ERROR: Per-file upload read timed out on:", path_in_repo) print("You can retry this script or use `hf upload-large-folder` / `HfApi.upload_large_folder`.") sys.exit(1) + except HfHubHTTPError as exc: + # If we hit 429 here, retry_call would have already retried; if it still fails, surface a helpful message. 
+ print() + print("ERROR: Failed to upload file due to HF Hub HTTP error:", path_in_repo) + print(f" {exc!r}") except Exception as exc: print(f"WARNING: Failed to upload {path_in_repo!s}: {exc!r}") if not success: print(f"Failed to upload: {path_in_repo}") + # Small pause between per-file uploads to avoid burst-rate limiting + time.sleep(0.1 + random.uniform(0, 0.05)) dataset_url = f"https://huggingface.co/datasets/{repo_id}" print() From 72386ac378d593b48e0c4f50173e2b3c29e2f4aa Mon Sep 17 00:00:00 2001 From: sou-cheng-choi Date: Thu, 20 Nov 2025 10:49:54 -0600 Subject: [PATCH 09/15] Update LD_DATA.md Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- LD_DATA.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LD_DATA.md b/LD_DATA.md index da08e50..6ec52d7 100644 --- a/LD_DATA.md +++ b/LD_DATA.md @@ -242,7 +242,7 @@ For a `shiftmod1` in $s$ dimensions, the file will contain $s$ in the first line 3 # s = 3 dimensions 0.32638741823951621 0.91325392536931693 -0.1530364040t106301 +0.15303640401106301 ``` For a `dshift` with $r$ digits of accuracy in base $b$, in $s$ dimensions, the file will contain $b$ in the first line, $s$ in the second line, $r$ in the third line, and then $s$ integers from 0 to $b^r-1$, one per line. For the latter, the digits of the base $b$ representation of the integer divided by $b^r$ will be added modulo $b$ to the corresponding digits of the base $b$ representation of the coordinate. For example, if $b=2$ and $r=31$, the randomization makes a xor of the 31 bits of this integer with the 31 most significant bits of the corresponding coordinate of each point. 
From d1014fd7a01cbbe210e0467fd020d58b6fa8a57f Mon Sep 17 00:00:00 2001 From: sou-cheng-choi Date: Thu, 20 Nov 2025 11:09:56 -0600 Subject: [PATCH 10/15] Update .github/workflows/sync-to-huggingface.yml Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .github/workflows/sync-to-huggingface.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/sync-to-huggingface.yml b/.github/workflows/sync-to-huggingface.yml index e4cc752..8a7c392 100644 --- a/.github/workflows/sync-to-huggingface.yml +++ b/.github/workflows/sync-to-huggingface.yml @@ -23,7 +23,6 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 with: - ref: main fetch-depth: 1 - name: Set up Python From 38b111dceb2aa6c892dd15236a643e033da9c3b6 Mon Sep 17 00:00:00 2001 From: sou-cheng-choi Date: Thu, 20 Nov 2025 14:22:12 -0600 Subject: [PATCH 11/15] Add unit tests and CI tests --- .github/workflows/ci.yml | 45 ++++ .github/workflows/sync-to-huggingface.yml | 23 +- LDData/__init__.py | 40 ++++ PUBLISH2HF.md | 151 ++++++++++++ README.md | 4 +- env.yml | 1 + pyproject.toml | 16 ++ pytest.ini | 6 + scripts/git_lfs_upload.sh | 30 ++- tests/conftest.py | 6 + tests/test_hf_dataset.py | 16 ++ tests/test_integration_hf.py | 70 ++++++ tests/test_upload_unit.py | 96 ++++++++ upload.py | 265 +++++++++++----------- 14 files changed, 623 insertions(+), 146 deletions(-) create mode 100644 .github/workflows/ci.yml create mode 100644 LDData/__init__.py create mode 100644 PUBLISH2HF.md create mode 100644 pyproject.toml create mode 100644 pytest.ini create mode 100644 tests/conftest.py create mode 100644 tests/test_hf_dataset.py create mode 100644 tests/test_integration_hf.py create mode 100644 tests/test_upload_unit.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..039213d --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,45 @@ +name: CI - Tests + +on: + push: + branches: + - main + - hug + pull_request: + branches: + - 
main + - hug + +jobs: + tests: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 1 + lfs: true + + - name: Ensure Git LFS + run: | + git lfs install --local || true + + - name: Setup conda environment from env.yml + uses: conda-incubator/setup-miniconda@v3 + with: + environment-file: env.yml + activate-environment: lddata + auto-update-conda: true + + - name: Install package into the conda env + shell: bash -l {0} + run: | + python -m pip install --upgrade pip + python -m pip install -e . + + - name: Run tests + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + run: | + pytest diff --git a/.github/workflows/sync-to-huggingface.yml b/.github/workflows/sync-to-huggingface.yml index 8a7c392..3de7e04 100644 --- a/.github/workflows/sync-to-huggingface.yml +++ b/.github/workflows/sync-to-huggingface.yml @@ -24,16 +24,24 @@ jobs: uses: actions/checkout@v4 with: fetch-depth: 1 + lfs: true + + - name: Ensure Git LFS + run: | + git lfs install --local || true - - name: Set up Python - uses: actions/setup-python@v5 + - name: Setup conda environment from env.yml + uses: conda-incubator/setup-miniconda@v3 with: - python-version: '3.11' - - - name: Install dependencies + environment-file: env.yml + activate-environment: lddata + auto-update-conda: true + + - name: Install package into the conda env + shell: bash -l {0} run: | python -m pip install --upgrade pip - pip install "huggingface_hub>=0.32.0" httpx + python -m pip install -e . - name: Upload to HuggingFace env: @@ -41,8 +49,7 @@ jobs: run: | python upload.py \ --repo-id QMCSoftware/LDData \ - --local-path . \ - --yes + --local-path . - name: Report status if: success() diff --git a/LDData/__init__.py b/LDData/__init__.py new file mode 100644 index 0000000..233443c --- /dev/null +++ b/LDData/__init__.py @@ -0,0 +1,40 @@ +"""LDData package shim. 
+ +This package file exposes the top-level `upload.py` module as +`LDData.upload` so tests and imports using `from LDData import upload` +work without moving the original script. +""" +from __future__ import annotations + +import importlib.util +from pathlib import Path +import sys + + +def _load_top_level_module(name: str, filename: Path): + spec = importlib.util.spec_from_file_location(name, str(filename)) + module = importlib.util.module_from_spec(spec) + loader = spec.loader + assert loader is not None + loader.exec_module(module) + return module + + +# Locate the repository root (parent of this package directory) +_repo_root = Path(__file__).resolve().parent.parent +# Path to the existing top-level upload.py +_upload_path = _repo_root / "upload.py" + +if _upload_path.exists(): + # Load the top-level upload.py as a module named 'LDData.upload' + _mod = _load_top_level_module("LDData.upload", _upload_path) + # Expose it in the package namespace + upload = _mod + __all__ = ["upload"] +else: + # Fallback: create a minimal stub so imports fail with clearer message later + def _missing(): + raise ImportError("upload.py not found at project root") + + upload = _missing + __all__ = ["upload"] diff --git a/PUBLISH2HF.md b/PUBLISH2HF.md new file mode 100644 index 0000000..a117ea8 --- /dev/null +++ b/PUBLISH2HF.md @@ -0,0 +1,151 @@ +# Publishing LDData to Hugging Face (upload.py) + +This document summarizes how to use `upload.py` to publish the `LDData` repository to the Hugging Face Datasets Hub, and provides notes for CI and secure authentication. + +## Requirements + +- Python 3.8+ +- `huggingface_hub>=0.32.0` (install with `pip install "huggingface_hub>=0.32.0"`) +- Network access to `huggingface.co` when uploading + +## Basic usage + +Run from the repository root (or pass `--local-path`): + +```bash +python upload.py \ + --repo-id QMCSoftware/LDData \ + --local-path . 
+``` + +This will create (or reuse) the dataset repo `QMCSoftware/LDData` on the Hub and upload files from the local checkout. + +## Important flags + +- `--repo-id`: target HF dataset id (default: `QMCSoftware/LDData`). +- `--local-path`: path to local LDData checkout (default: `.`). +- `--token`: HF token. If omitted, the script reads the `HF_TOKEN` environment variable. +- `--private`: create the dataset as private. +- `--dry-run`: don't upload; print what would be done. +- `--reset-remote`: delete the remote dataset repo on the Hub before uploading (destructive). +- `--yes`: skip interactive confirmation prompts (use with care, required for non-interactive CI with `--reset-remote`). + +Example (non-interactive destructive reset + upload): + +```bash +export HF_TOKEN="hf_xxx..." +python upload.py --repo-id QMCSoftware/LDData --local-path . --reset-remote --yes +``` + +## CI integration (GitHub Actions) + +Create a repository secret named `HF_TOKEN` containing a Hugging Face token with the required permissions. Example workflow step: + +```yaml +- name: Upload to HuggingFace + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + run: | + python upload.py \ + --repo-id QMCSoftware/LDData \ + --local-path . \ + --reset-remote \ + --yes +``` + +Note: the repository included a workflow `.github/workflows/sync-to-huggingface.yml` that already calls `upload.py` for automated syncs. If you want to avoid destructive resets in CI, remove `--reset-remote --yes` from the workflow. + +## Local development (recommended) + +This repository includes an `env.yml` Conda environment for reproducible local development and for CI. Use it to create and activate the `lddata` environment, then install the package in editable mode: + +```bash +conda env create -f env.yml +conda activate lddata +python -m pip install --upgrade pip +python -m pip install -e . +``` + +Notes: +- CI uses `env.yml` (via `conda-incubator/setup-miniconda`) so the same environment is reproducible in GitHub Actions. 
+- `pip install -e .` uses the project's packaging (see `pyproject.toml`) so tests can import the `LDData` package. + +## Packaging (pyproject.toml) + +This repo uses a PEP 621 `pyproject.toml` with setuptools as the build backend. That lets `pip install -e .` (editable install) work consistently in both local and CI environments. + +If you prefer modern packaging workflows, keep `pyproject.toml` and use `pip` inside the conda env as shown above. + +## Run tests + +Unit tests are designed to be hermetic (they mock network calls). Integration tests that exercise the real Hugging Face API are gated and skipped by default. + +Run the full unit test suite: + +```bash +pytest -q +``` + +Run the destructive integration test (WARNING: deletes remote dataset) + +```bash +export HF_TOKEN=hf_xxx... # token with write/delete perms +export HF_INTEGRATION=1 +pytest -q tests/test_integration_hf.py +``` + +The integration test is intentionally skipped unless you set `HF_INTEGRATION=1` and provide `HF_TOKEN`. This prevents accidental destructive runs in CI or by contributors. + +## Quick checklist before publishing + +- Ensure `HF_TOKEN` (or SSH deploy key) is available and has write/delete permissions for the target dataset. +- Verify `upload.py` flags: use `--dry-run` to preview, and use `--reset-remote --yes` only when you intend to wipe the remote repo. +- Prefer SSH deploy keys for persistent automation where possible. + + +## Authentication & security + +Preferred options (ordered): + +1. **SSH deploy key** (recommended for long-lived automation) + - On Hugging Face dataset repo settings, add a *deploy key* with write access. + - Use the SSH clone URL `git@huggingface.co:datasets/ORG/REPO.git` in scripts or CI. + - Benefits: no tokens in environment, standard SSH key management. + +2. **CI secrets + `GIT_ASKPASS`** (used by `scripts/git_lfs_upload.sh`) + - Inject token into CI via secrets (e.g. 
`HF_TOKEN`) and use a temporary non-disclosable `GIT_ASKPASS` helper so the token is not visible in process listings. + - The repository's `scripts/git_lfs_upload.sh` uses this method to avoid embedding the token in the URL. + +3. **HF_TOKEN environment variable** passed directly to `upload.py` + - `upload.py` reads `HF_TOKEN` from the environment if `--token` is not provided. + - This is acceptable for CI (secrets are injected into the runner), but avoid printing the token or embedding it in command lines. + +Avoid embedding tokens in clone URLs (e.g. `https://hf_xxx@...`) because they appear in `ps` output, shell history and logs. + +## Safety notes about `--reset-remote` + +- `--reset-remote` deletes the remote dataset repository (destructive). Use `--yes` to skip confirmation in automation. +- If deletion fails (insufficient permissions or other errors), the script currently logs a warning and continues to (re)create the repo — change this behavior if you want strict failure. + +## Troubleshooting + +- If you see `ModuleNotFoundError: No module named 'huggingface_hub'`, install the dependency in the environment running the script: `pip install huggingface_hub`. +- For intermittent upload errors, the script uses retries and exponential backoff; ensure you have stable network connectivity. + +## Examples + +- Dry run to see what would happen: + ```bash + python upload.py --local-path . --dry-run + ``` + +- Upload privately without prompting: + ```bash + python upload.py --repo-id MyOrg/MyDataset --private --yes + ``` + +## Related scripts + +- `scripts/git_lfs_upload.sh` — alternate upload method that uses git + git-lfs and commits/pushes selected folders; supports `GIT_ASKPASS` when `HF_TOKEN` is set. +- `.github/workflows/sync-to-huggingface.yml` — example workflow that syncs selected paths to Hugging Face using `upload.py`. +- `.github/workflows/ci.yml` — main CI workflow that runs tests; does not upload by default. 
\ No newline at end of file diff --git a/README.md b/README.md index f5cbdff..f40a40a 100644 --- a/README.md +++ b/README.md @@ -223,7 +223,7 @@ This dataset incorporates formats and ideas from foundational work in QMC: If you use LDData in academic work, please cite: ``` -@article{sorokin.2025.ld_randomizations_ho_nets_fast_kernel_mats, +@article{sorokin2025, title = {{QMCPy}: a {P}ython software for randomized low-discrepancy sequences, quasi-{M}onte {C}arlo, and fast kernel methods}, author = {Aleksei G. Sorokin}, year = {2025}, @@ -233,7 +233,7 @@ If you use LDData in academic work, please cite: } -@inproceedings{choi.QMC_software, +@inproceedings{choi2022, title = {Quasi-{M}onte {C}arlo software}, author = {Choi, Sou-Cheng T. and Hickernell, Fred J. and Rathinavel, Jagadeeswaran and McCourt, Michael J. and Sorokin, Aleksei G.}, year = {2022}, diff --git a/env.yml b/env.yml index f108ca0..2eeb851 100644 --- a/env.yml +++ b/env.yml @@ -127,3 +127,4 @@ dependencies: - webencodings==0.5.1 - websocket-client==1.8.0 - huggingface_hub>=0.32.0 + - pytest==9.0.1 diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..8d63b31 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,16 @@ +[build-system] +requires = ["setuptools>=61.0", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "LDData" +version = "0.0.0" +description = "LDData helper package for tests" +readme = "README.md" +requires-python = ">=3.11" +license = { text = "MIT" } +authors = [ { name = "QMCSoftware" } ] + +[tool.setuptools.packages.find] +where = ["."] +include = ["LDData"] diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..20437c8 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,6 @@ +[pytest] +minversion = 6.0 +addopts = -q -m "not integration" +testpaths = tests +markers = + integration: mark tests that require network access or are destructive to remote state \ No newline at end of file diff --git a/scripts/git_lfs_upload.sh 
b/scripts/git_lfs_upload.sh index 9e1acfd..2c576ef 100644 --- a/scripts/git_lfs_upload.sh +++ b/scripts/git_lfs_upload.sh @@ -108,12 +108,32 @@ if ! command -v git-lfs >/dev/null 2>&1; then fi fi -# Prepare clone URL +# Prepare clone URL (do NOT embed token in the URL) +CLONE_URL="https://huggingface.co/datasets/${REPO_ID}.git" + +# If HF_TOKEN is provided, use a temporary GIT_ASKPASS helper so the token +# is not exposed on the command line or process list. The helper prints the +# token when git prompts for a password. We keep the helper for the duration if [[ -n "$HF_TOKEN" ]]; then - # Embed token for non-interactive auth (note: exposing token in process list may be a security risk) - CLONE_URL="https://${HF_TOKEN}@huggingface.co/datasets/${REPO_ID}.git" -else - CLONE_URL="https://huggingface.co/datasets/${REPO_ID}.git" + ASKPASS_SCRIPT=$(mktemp -t hf_askpass.XXXXXX) + # Write an askpass helper that prints the token from the environment. + # Use a quoted heredoc so $HF_TOKEN is not expanded into the file. 
+ cat > "$ASKPASS_SCRIPT" <<'ASKPASS_EOF' +#!/usr/bin/env sh +# GIT_ASKPASS helper: print HF_TOKEN from the environment (do not echo a newline) +printf "%s" "$HF_TOKEN" +ASKPASS_EOF + chmod 700 "$ASKPASS_SCRIPT" + export GIT_ASKPASS="$ASKPASS_SCRIPT" + # Prevent git from falling back to terminal prompting + export GIT_TERMINAL_PROMPT=0 + + cleanup_askpass() { + unset GIT_ASKPASS + unset GIT_TERMINAL_PROMPT + rm -f "$ASKPASS_SCRIPT" || true + } + trap cleanup_askpass EXIT fi # Clone the repo diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..60c9fcc --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,6 @@ +import sys +from pathlib import Path + +# Ensure repository root is on sys.path so tests can import the local package +ROOT = Path(__file__).resolve().parents[1] +sys.path.insert(0, str(ROOT)) diff --git a/tests/test_hf_dataset.py b/tests/test_hf_dataset.py new file mode 100644 index 0000000..94dac2b --- /dev/null +++ b/tests/test_hf_dataset.py @@ -0,0 +1,16 @@ +from huggingface_hub import HfApi + + +def test_sou_cheng_lddata_exist_and_has_readme(): + """Fast smoke test: ensure the public dataset repo exists and includes a README.""" + api = HfApi() + files = api.list_repo_files("Sou-Cheng/LDData", repo_type="dataset") + + assert isinstance(files, list), "Expected list of files from Hugging Face" + assert len(files) > 0, "Dataset 'Sou-Cheng/LDData' appears to be empty or not accessible" + + # Check for README presence in a case-insensitive manner + lowered = [f.lower() for f in files] + assert any(name.endswith("readme.md") for name in lowered), ( + "Expected a README.md in the dataset files; got: " + ", ".join(files[:10]) + ) diff --git a/tests/test_integration_hf.py b/tests/test_integration_hf.py new file mode 100644 index 0000000..5059b08 --- /dev/null +++ b/tests/test_integration_hf.py @@ -0,0 +1,70 @@ +import os +import shutil +import tempfile +from pathlib import Path + +import pytest + +try: + from huggingface_hub import 
HfApi +except Exception: # pragma: no cover - if huggingface_hub missing, skip integration + HfApi = None + + +HF_INTEGRATION = os.environ.get("HF_INTEGRATION", "0").lower() in ("1", "true", "yes") +HF_TOKEN = os.environ.get("HF_TOKEN") +REPO_ID = os.environ.get("HF_INTEGRATION_REPO", "QMCSoftware/LDData") + + +@pytest.mark.skipif(not HF_INTEGRATION or HfApi is None, reason="Integration test disabled (set HF_INTEGRATION=1 and install huggingface_hub)") +def test_delete_and_upload_lattice_and_readme(): + """Integration test: delete remote dataset repo and upload only `lattice/` + `README.md`. + + WARNING: This test is destructive. It will delete the remote dataset repo + specified by `REPO_ID` and recreate it. Only enable it locally or in CI + when you explicitly want this behavior by setting `HF_INTEGRATION=1`. + """ + + if not HF_TOKEN: + pytest.skip("HF_TOKEN not provided; skipping destructive integration test") + + repo_root = Path(__file__).resolve().parents[2] + src_lattice = repo_root / "lattice" + src_readme = repo_root / "README.md" + + if not src_lattice.exists(): + pytest.skip("lattice/ folder not present in repo root; skipping") + + api = HfApi(token=HF_TOKEN) + + # Delete remote repo if it exists + try: + api.delete_repo(repo_id=REPO_ID, repo_type="dataset") + except Exception: + # ignore errors (repo may not exist or insufficient perms) + pass + + # Create the dataset repo (exist_ok=True will not fail if already present) + try: + api.create_repo(repo_id=REPO_ID, repo_type="dataset", exist_ok=True) + except Exception as exc: + pytest.fail(f"Failed to create dataset repo {REPO_ID}: {exc}") + + # Prepare a temporary folder containing only lattice/ and README.md + with tempfile.TemporaryDirectory() as td: + td_path = Path(td) + shutil.copytree(src_lattice, td_path / "lattice") + if src_readme.exists(): + shutil.copy(src_readme, td_path / "README.md") + + # Upload the folder contents to the dataset repo + try: + api.upload_folder(folder_path=str(td_path), 
repo_id=REPO_ID, repo_type="dataset") + except TypeError: + # Some versions of HfApi.upload_folder have a different signature + api.upload_folder(str(td_path), repo_id=REPO_ID, repo_type="dataset") + + # Verify that README.md exists in the remote dataset files + files = api.list_repo_files(REPO_ID, repo_type="dataset") + lowered = [f.lower() for f in files] + assert any(p.endswith("readme.md") for p in lowered), "README.md not found in uploaded dataset" diff --git a/tests/test_upload_unit.py b/tests/test_upload_unit.py new file mode 100644 index 0000000..c39c78d --- /dev/null +++ b/tests/test_upload_unit.py @@ -0,0 +1,96 @@ +import tempfile +import os +from pathlib import Path +import types + +import pytest + +from LDData import upload as up + + +class DummyAPI: + def __init__(self): + self.deleted = False + self.created = False + self.uploaded_folder = False + self.uploaded_large = False + self.uploaded_files = [] + + def delete_repo(self, **kwargs): + self.deleted = True + + def create_repo(self, **kwargs): + self.created = True + + def upload_folder(self, *args, **kwargs): + self.uploaded_folder = True + + def upload_large_folder(self, *args, **kwargs): + self.uploaded_large = True + + def upload_file(self, *args, **kwargs): + # record path_in_repo argument if present + if 'path_in_repo' in kwargs: + self.uploaded_files.append(kwargs['path_in_repo']) + else: + # some API variants use positional args; accept that case too + if len(args) >= 2: + self.uploaded_files.append(args[1]) + + +def test_delete_remote_repo_calls_delete_when_yes_true(): + api = DummyAPI() + # call with yes=True to avoid interactive prompt + up.delete_remote_repo(api, repo_id="owner/repo", token="tok", yes=True) + assert api.deleted is True + + +def test_perform_bulk_upload_uses_upload_folder(tmp_path, monkeypatch): + api = DummyAPI() + # create a small file so total_size < LARGE_THRESHOLD + d = tmp_path / "repo" + d.mkdir() + f = d / "a.txt" + f.write_text("hello") + + ok = 
up.perform_bulk_upload(api, local_path=d, repo_id="owner/repo", ignore_patterns=[], token="tok") + assert ok is True + assert api.uploaded_folder is True + + +def test_perform_bulk_upload_large_prefers_large_when_available(tmp_path): + api = DummyAPI() + # create a file > LARGE_THRESHOLD to trigger large upload path + d = tmp_path / "repo2" + d.mkdir() + big = d / "big.bin" + # write a file slightly larger than threshold (50 MiB) + big.write_bytes(b"0" * (50 * 1024 * 1024 + 10)) + + # monkeypatch attribute to simulate large upload existing + # DummyAPI already has upload_large_folder method + ok = up.perform_bulk_upload(api, local_path=d, repo_id="owner/repo", ignore_patterns=[], token="tok") + assert ok is True + assert api.uploaded_large is True + + +def test_perform_per_file_upload_calls_upload_file(tmp_path): + api = DummyAPI() + # create a couple of files list format expected by perform_per_file_upload: (fullpath, rel) + d = tmp_path / "repo3" + d.mkdir() + (d / "x.txt").write_text("x") + (d / "sub").mkdir() + (d / "sub" / "y.txt").write_text("y") + + files = [] + for root, _, fnames in os.walk(d): + for fn in fnames: + full = Path(root) / fn + rel = os.path.relpath(str(full), start=str(d)).replace(os.sep, '/') + files.append((full, rel)) + + up.perform_per_file_upload(api, files, repo_id="owner/repo", token="tok") + # uploaded_files should include both relative paths + assert any(p.endswith('x.txt') for p in api.uploaded_files) + assert any(p.endswith('sub/y.txt') for p in api.uploaded_files) diff --git a/upload.py b/upload.py index a332ae9..7c0deae 100644 --- a/upload.py +++ b/upload.py @@ -134,8 +134,9 @@ def retry_call( try: status_code = getattr(resp, "status_code", status_code) headers = getattr(resp, "headers", {}) or {} - except Exception: - pass + except Exception as exc: + # Log the exception for debugging purposes but continue execution + print(f"WARNING: An unexpected error occurred: {exc!r}") if status_code in allowed_status_for_retry: # Honor 
Retry-After if present retry_after = None @@ -178,92 +179,62 @@ def retry_call( raise RuntimeError("retry_call failed without exception") -def main() -> None: - args = parse_args() +def create_api(token: str) -> HfApi: + """Create an authenticated HfApi client.""" + return HfApi(token=token) - local_path = Path(args.local_path).expanduser().resolve() - if not local_path.exists(): - raise SystemExit(f"Local path does not exist: {local_path}") - # Sanity check: are we in LDData? - readme = local_path / "README.md" - if not readme.exists(): - print( - f"WARNING: {readme} does not exist. " - "Are you sure this is the LDData repo root?", - file=sys.stderr, +def delete_remote_repo(api: HfApi, repo_id: str, token: str, yes: bool) -> None: + """Delete the remote dataset repo if requested. Non-fatal on errors.""" + if not yes: + resp = input( + f"Are you sure you want to DELETE the dataset repo '{repo_id}' on Hugging Face? This is irreversible. Type 'yes' to continue: " ) + if resp.strip().lower() != "yes": + print("Aborting: remote reset cancelled by user.") + sys.exit(0) - token = get_token(args.token) - repo_id = args.repo_id - - print(f"Using repo_id: {repo_id}") - print(f"Local path : {local_path}") - print(f"Private : {args.private}") - if args.dry_run: - print("Dry run enabled: NOT creating or uploading, just showing intent.") - return + print(f"Deleting remote dataset repo '{repo_id}' (if it exists) on Hugging Face...") + try: + retry_call(getattr(api, "delete_repo"), repo_id=repo_id, repo_type="dataset", token=token) + print("Remote dataset repo deleted (or did not exist).") + except HfHubHTTPError as exc: + print() + print("WARNING: Failed to delete remote dataset repo:") + print(f" {exc!r}") + print("Continuing to (re)create the repository.") + except Exception as exc: + print() + print("WARNING: Unexpected error while attempting to delete remote repo:") + print(f" {exc!r}") + print("Continuing to (re)create the repository.") - # Initialize API client - api = 
HfApi(token=token) - # 1. Create (or reuse) the dataset repo on the Hub +def create_or_reuse_repo(repo_id: str, private: bool, token: str) -> None: + """Create (or reuse) the dataset repo on the Hub.""" print(f"Creating (or reusing) dataset repo '{repo_id}' on the Hub...") - retry_call( - create_repo, - repo_id=repo_id, - repo_type="dataset", - private=args.private, - exist_ok=True, - token=token, - ) + retry_call(create_repo, repo_id=repo_id, repo_type="dataset", private=private, exist_ok=True, token=token) - # 2. Upload folder contents - # We ignore some typical non-data files to keep the repo clean. - # Adjust this list if you want to exclude more or fewer things. - ignore_patterns = [ - "sc/*", - ".git/*", - ".gitignore", - ".DS_Store", - "__pycache__/*", - "*.pyc", - "*.pyo", - "*~", - "*.ipynb_checkpoints*", - "raw.githubusercontent.com" - # If you do NOT want to upload the demo notebook or env file, keep these: - # "LDData Demo.ipynb", - # "env.yml", - ] - - print("Uploading local folder to the Hub (this may take a while)...") - - # NOTE: - # Older/newer versions of huggingface_hub's HfApi.upload_folder do not accept - # a `timeout` keyword argument. Passing it raises: - # TypeError: HfApi.upload_folder() got an unexpected keyword argument 'timeout' - # To remain compatible, call upload_folder without a `timeout` kwarg and - # handle httpx.ReadTimeout explicitly. 
- # Build list of candidate files and compute total size (exclude ignored) +def build_ignore_checker(ignore_patterns): def is_ignored(rel_path: str) -> bool: - # fnmatch against each ignore pattern; use posix style paths for patt in ignore_patterns: - if fnmatch.fnmatch(rel_path, patt): - return True - if fnmatch.fnmatch("/" + rel_path, patt): + if fnmatch.fnmatch(rel_path, patt) or fnmatch.fnmatch("/" + rel_path, patt): return True return False + return is_ignored + + +def gather_files(local_path: Path, ignore_patterns): + """Return a list of (full_path, rel_posix_path) and total size, skipping ignore patterns.""" + is_ignored = build_ignore_checker(ignore_patterns) files_to_upload = [] total_size = 0 for root, _, files in os.walk(local_path): for fname in files: full = Path(root) / fname - # Robust relative path computation: use os.path.relpath to avoid Path.relative_to errors rel = os.path.relpath(str(full), start=str(local_path)) - # Normalize to posix separators and strip any leading './' or '/' rel = rel.replace(os.sep, "/") if rel.startswith("./"): rel = rel[2:] @@ -277,45 +248,31 @@ def is_ignored(rel_path: str) -> bool: sz = 0 files_to_upload.append((full, rel)) total_size += sz + return files_to_upload, total_size - # Threshold to prefer upload_large_folder (50 MiB) - LARGE_THRESHOLD = 50 * 1024 * 1024 +def perform_bulk_upload(api: HfApi, local_path: Path, repo_id: str, ignore_patterns, token: str) -> bool: + """Attempt bulk upload via upload_large_folder or upload_folder. Returns True on success.""" + LARGE_THRESHOLD = 50 * 1024 * 1024 + # prefer upload_large_folder if folder is large and API provides it + files_to_upload, total_size = gather_files(local_path, ignore_patterns) try: if total_size > LARGE_THRESHOLD and hasattr(api, "upload_large_folder"): - # Prefer API method designed for large folders when available. 
try: - retry_call( - getattr(api, "upload_large_folder"), - folder_path=str(local_path), - repo_id=repo_id, - repo_type="dataset", - ignore_patterns=ignore_patterns, - token=token, - ) + retry_call(getattr(api, "upload_large_folder"), folder_path=str(local_path), repo_id=repo_id, repo_type="dataset", ignore_patterns=ignore_patterns, token=token) + return True except TypeError: - # Signature mismatch: try without ignore_patterns - retry_call( - getattr(api, "upload_large_folder"), - folder_path=str(local_path), - repo_id=repo_id, - repo_type="dataset", - token=token, - ) + retry_call(getattr(api, "upload_large_folder"), folder_path=str(local_path), repo_id=repo_id, repo_type="dataset", token=token) + return True except Exception as exc: print() print("ERROR: upload_large_folder failed:") print(f" {exc!r}") print("Falling back to per-file upload...") - # fall through to per-file upload + return False else: - retry_call( - getattr(api, "upload_folder"), - folder_path=str(local_path), - repo_id=repo_id, - repo_type="dataset", - ignore_patterns=ignore_patterns, - ) + retry_call(getattr(api, "upload_folder"), folder_path=str(local_path), repo_id=repo_id, repo_type="dataset", ignore_patterns=ignore_patterns) + return True except httpx.ReadTimeout: print() print("ERROR: Upload read timed out.") @@ -331,6 +288,7 @@ def is_ignored(rel_path: str) -> bool: print("ERROR: Upload function raised a TypeError (likely a signature mismatch):") print(f" {exc!r}") print("Falling back to per-file upload...") + return False except HfHubHTTPError as exc: print() print("ERROR: HF Hub returned an HTTP error while attempting to upload:") @@ -338,42 +296,87 @@ def is_ignored(rel_path: str) -> bool: print("If this is a rate limit (429) you may need to wait, reduce request rate, or upgrade your plan.") sys.exit(1) except Exception: - # If upload_large_folder/upload_folder raised but we want to fallback to per-file, continue below. 
- pass - - # Per-file upload fallback: ensure we use the normalized relative paths so subfolders are preserved. - if files_to_upload: - print("Falling back to per-file upload (this is slower but more robust for flaky networks)...") - for idx, (file_path, rel) in enumerate(files_to_upload): - # Ensure rel is relative and posix-normalized (it already is from above) - path_in_repo = rel.lstrip("/") # safety - success = False - try: - retry_call( - getattr(api, "upload_file"), - path_or_fileobj=str(file_path), - path_in_repo=path_in_repo, - repo_id=repo_id, - repo_type="dataset", - token=token, - ) - success = True - except httpx.ReadTimeout: - print() - print("ERROR: Per-file upload read timed out on:", path_in_repo) - print("You can retry this script or use `hf upload-large-folder` / `HfApi.upload_large_folder`.") - sys.exit(1) - except HfHubHTTPError as exc: - # If we hit 429 here, retry_call would have already retried; if it still fails, surface a helpful message. - print() - print("ERROR: Failed to upload file due to HF Hub HTTP error:", path_in_repo) - print(f" {exc!r}") - except Exception as exc: - print(f"WARNING: Failed to upload {path_in_repo!s}: {exc!r}") - if not success: - print(f"Failed to upload: {path_in_repo}") - # Small pause between per-file uploads to avoid burst-rate limiting - time.sleep(0.1 + random.uniform(0, 0.05)) + return False + + +def perform_per_file_upload(api: HfApi, files_to_upload, repo_id: str, token: str) -> None: + """Upload files one-by-one as a fallback.""" + if not files_to_upload: + return + print("Falling back to per-file upload (this is slower but more robust for flaky networks)...") + for idx, (file_path, rel) in enumerate(files_to_upload): + path_in_repo = rel.lstrip("/") + success = False + try: + retry_call(getattr(api, "upload_file"), path_or_fileobj=str(file_path), path_in_repo=path_in_repo, repo_id=repo_id, repo_type="dataset", token=token) + success = True + except httpx.ReadTimeout: + print() + print("ERROR: Per-file 
upload read timed out on:", path_in_repo) + print("You can retry this script or use `hf upload-large-folder` / `HfApi.upload_large_folder`.") + sys.exit(1) + except HfHubHTTPError as exc: + print() + print("ERROR: Failed to upload file due to HF Hub HTTP error:", path_in_repo) + print(f" {exc!r}") + except Exception as exc: + print(f"WARNING: Failed to upload {path_in_repo!s}: {exc!r}") + if not success: + print(f"Failed to upload: {path_in_repo}") + time.sleep(0.1 + random.uniform(0, 0.05)) + + +def main() -> None: + args = parse_args() + + local_path = Path(args.local_path).expanduser().resolve() + if not local_path.exists(): + raise SystemExit(f"Local path does not exist: {local_path}") + + readme = local_path / "README.md" + if not readme.exists(): + print(f"WARNING: {readme} does not exist. Are you sure this is the LDData repo root?", file=sys.stderr) + + token = get_token(args.token) + repo_id = args.repo_id + + print(f"Using repo_id: {repo_id}") + print(f"Local path : {local_path}") + print(f"Private : {args.private}") + if args.dry_run: + print("Dry run enabled: NOT creating or uploading, just showing intent.") + return + + api = create_api(token) + + if args.reset_remote: + delete_remote_repo(api, repo_id, token, args.yes) + + create_or_reuse_repo(repo_id, args.private, token) + + ignore_patterns = [ + "sc/*", + ".git/*", + ".gitignore", + ".DS_Store", + "__pycache__/*", + "*.pyc", + "*.pyo", + "*~", + "*.ipynb_checkpoints*", + "raw.githubusercontent.com", + ] + + print("Uploading local folder to the Hub (this may take a while)...") + + # gather files in case we need per-file fallback + files_to_upload, _ = gather_files(local_path, ignore_patterns) + + bulk_ok = perform_bulk_upload(api, local_path, repo_id, ignore_patterns, token) + if bulk_ok: + print("Bulk upload succeeded — skipping per-file fallback.") + else: + perform_per_file_upload(api, files_to_upload, repo_id, token) dataset_url = f"https://huggingface.co/datasets/{repo_id}" print() From 
030e8d7628a2703fe59511f8936193f951585008 Mon Sep 17 00:00:00 2001 From: sou-cheng-choi Date: Thu, 20 Nov 2025 14:23:08 -0600 Subject: [PATCH 12/15] Update git_lfs_upload.sh --- scripts/git_lfs_upload.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/git_lfs_upload.sh b/scripts/git_lfs_upload.sh index 2c576ef..44488a3 100644 --- a/scripts/git_lfs_upload.sh +++ b/scripts/git_lfs_upload.sh @@ -114,6 +114,7 @@ CLONE_URL="https://huggingface.co/datasets/${REPO_ID}.git" # If HF_TOKEN is provided, use a temporary GIT_ASKPASS helper so the token # is not exposed on the command line or process list. The helper prints the # token when git prompts for a password. We keep the helper for the duration +# of the script (so pushes work) and remove it on exit. if [[ -n "$HF_TOKEN" ]]; then ASKPASS_SCRIPT=$(mktemp -t hf_askpass.XXXXXX) # Write an askpass helper that prints the token from the environment. From 18c2ca2fec74c21dcdd623ab915d4be22023cc4a Mon Sep 17 00:00:00 2001 From: sou-cheng-choi Date: Thu, 20 Nov 2025 14:27:48 -0600 Subject: [PATCH 13/15] Update ci.yml --- .github/workflows/ci.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 039213d..8be045d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -37,9 +37,10 @@ jobs: run: | python -m pip install --upgrade pip python -m pip install -e . 
+ python -m pip install pytest - name: Run tests env: HF_TOKEN: ${{ secrets.HF_TOKEN }} run: | - pytest + python -m pytest -q \ No newline at end of file From 31865bfb5f9c78a75f1c1ac31fa2fedaad0de8f5 Mon Sep 17 00:00:00 2001 From: sou-cheng-choi Date: Thu, 20 Nov 2025 14:33:23 -0600 Subject: [PATCH 14/15] Update ci.yml --- .github/workflows/ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8be045d..bd12803 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -35,6 +35,7 @@ jobs: - name: Install package into the conda env shell: bash -l {0} run: | + conda activate lddata python -m pip install --upgrade pip python -m pip install -e . python -m pip install pytest From 2fecace67a005ad177ebff6d722ae68de2b8d660 Mon Sep 17 00:00:00 2001 From: sou-cheng-choi Date: Thu, 20 Nov 2025 14:39:06 -0600 Subject: [PATCH 15/15] Fix CI Test failure --- .github/workflows/ci.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index bd12803..59e658b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -43,5 +43,7 @@ jobs: - name: Run tests env: HF_TOKEN: ${{ secrets.HF_TOKEN }} + shell: bash -l {0} run: | - python -m pytest -q \ No newline at end of file + conda activate lddata + python -m pytest -q \ No newline at end of file