diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..59e658b --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,49 @@ +name: CI - Tests + +on: + push: + branches: + - main + - hug + pull_request: + branches: + - main + - hug + +jobs: + tests: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 1 + lfs: true + + - name: Ensure Git LFS + run: | + git lfs install --local || true + + - name: Setup conda environment from env.yml + uses: conda-incubator/setup-miniconda@v3 + with: + environment-file: env.yml + activate-environment: lddata + auto-update-conda: true + + - name: Install package into the conda env + shell: bash -l {0} + run: | + conda activate lddata + python -m pip install --upgrade pip + python -m pip install -e . + python -m pip install pytest + + - name: Run tests + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + shell: bash -l {0} + run: | + conda activate lddata + python -m pytest -q \ No newline at end of file diff --git a/.github/workflows/sync-to-huggingface.yml b/.github/workflows/sync-to-huggingface.yml new file mode 100644 index 0000000..3de7e04 --- /dev/null +++ b/.github/workflows/sync-to-huggingface.yml @@ -0,0 +1,64 @@ +name: Sync to HuggingFace Dataset + +on: + push: + branches: + - main + - hug + paths: + - 'pregenerated_pointsets/**' + - 'dnet/**' + - 'lattice/**' + - LICENSE.txt + - LDData Demo.ipynb + - LD_DATA.md + - README.md + workflow_dispatch: # Allow manual triggering + +jobs: + sync-to-hf: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 1 + lfs: true + + - name: Ensure Git LFS + run: | + git lfs install --local || true + + - name: Setup conda environment from env.yml + uses: conda-incubator/setup-miniconda@v3 + with: + environment-file: env.yml + activate-environment: lddata + auto-update-conda: true + + - name: Install package into the conda env + shell: bash -l {0} + run: | + python -m pip install --upgrade pip + python -m pip install -e . + + - name: Upload to HuggingFace + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + shell: bash -l {0} + run: | + python upload.py \ --repo-id QMCSoftware/LDData \ --local-path . 
+ + - name: Report status + if: success() + run: | + echo "✅ Successfully synchronized pregenerated_pointsets to HuggingFace dataset" + echo "Dataset URL: https://huggingface.co/datasets/QMCSoftware/LDData" + + - name: Report failure + if: failure() + run: | + echo "❌ Failed to synchronize to HuggingFace" + echo "Check the logs above for details" diff --git a/.gitignore b/.gitignore index 0eff7ac..57816f9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,7 @@ _ags/* *.DS_Store raw.githubusercontent.com/* -*.ipynb-checkpoints \ No newline at end of file +*.ipynb-checkpoints +/sc +.vscode/settings.json +*.pyc diff --git a/LDData Demo.ipynb b/LDData Demo.ipynb index f63d34c..7e24b2f 100644 --- a/LDData Demo.ipynb +++ b/LDData Demo.ipynb @@ -62,8 +62,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "/Users/agsorok/miniconda3/envs/lddata/lib/python3.13/site-packages/qmcpy/discrete_distribution/lattice/lattice.py:257\n", - "\tParameterWarning: Non-randomized lattice sequence includes the origin\n" + "/Users/terrya/miniconda3/envs/lddata/lib/python3.13/site-packages/qmcpy/discrete_distribution/lattice/lattice.py:248: ParameterWarning: Without randomization, the first lattice point is the origin\n", + " warnings.warn(\"Without randomization, the first lattice point is the origin\",ParameterWarning)\n" ] }, { @@ -130,8 +130,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "/Users/agsorok/miniconda3/envs/lddata/lib/python3.13/site-packages/qmcpy/discrete_distribution/digital_net_b2/digital_net_b2.py:389\n", - "\tParameterWarning: Non-randomized DigitalNetB2 sequence includes the origin\n" + "/Users/terrya/miniconda3/envs/lddata/lib/python3.13/site-packages/qmcpy/discrete_distribution/digital_net_b2/digital_net_b2.py:421: ParameterWarning: Without randomization, the first digtial net point is the origin\n", + " warnings.warn(\"Without randomization, the first digtial net point is the origin\",ParameterWarning)\n" ] }, { @@ -302,14 +302,6 @@ "generators = [qp.DigitalNetB2(d,randomize=False,generating_matrices=file) for file in files]\n", "plot_extensible_projections(generators,files,n=n)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b8ef5511", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/LDData/__init__.py b/LDData/__init__.py new file mode 100644 index 0000000..233443c --- /dev/null +++ b/LDData/__init__.py @@ -0,0 +1,40 @@ +"""LDData package shim. + +This package file exposes the top-level `upload.py` module as +`LDData.upload` so tests and imports using `from LDData import upload` +work without moving the original script. 
+""" +from __future__ import annotations + +import importlib.util +from pathlib import Path +import sys + + +def _load_top_level_module(name: str, filename: Path): + spec = importlib.util.spec_from_file_location(name, str(filename)) + module = importlib.util.module_from_spec(spec) + loader = spec.loader + assert loader is not None + loader.exec_module(module) + return module + + +# Locate the repository root (parent of this package directory) +_repo_root = Path(__file__).resolve().parent.parent +# Path to the existing top-level upload.py +_upload_path = _repo_root / "upload.py" + +if _upload_path.exists(): + # Load the top-level upload.py as a module named 'LDData.upload' + _mod = _load_top_level_module("LDData.upload", _upload_path) + # Expose it in the package namespace + upload = _mod + __all__ = ["upload"] +else: + # Fallback: create a minimal stub so imports fail with clearer message later + def _missing(): + raise ImportError("upload.py not found at project root") + + upload = _missing + __all__ = ["upload"] diff --git a/LD_DATA.md b/LD_DATA.md new file mode 100644 index 0000000..6ec52d7 --- /dev/null +++ b/LD_DATA.md @@ -0,0 +1,315 @@ +# Low Discrepancy Data + +Low discrepancy generating vectors and matrices. + +## Softwares + +- [LatNet Builder](https://github.com/umontreal-simul/latnetbuilder) +- [Magic Point Shop](https://people.cs.kuleuven.be/~dirk.nuyens/qmc-generators/) + +## General ideas and goals + +We propose *standard formats* to specify lattice rules, polynomial lattice rules, and digital nets, in simple text files. We want the formats to be simple and relatively compact, with no more than one line per dimension, +so they can easily be used for point sets in several thousand dimensions if desired. Ordinary text files with decimal numbers are good enough. They are easy to read by both humans and computers in any language. Other specialized formats (Json or Parquet, for example) can be more compact but then +the files are not as easy to read without extra tools. + +Joe and Kuo (2008) provide a file for up to 21201 dimensions for Sobol' points, and this seems to be the most widely used set of parameters for RQMC points at this time. They use a fixed and very simple format, which requires no special software to read. We want to provide similar types of files for other types of point sets, for an arbitrarily large number of dimensions. The SSJ simulation library (L’Ecuyer, 2016) can already read some of our proposed formats. Latnet Builder (L’Ecuyer et al., 2022) can (or will) produce output files in these formats. The choice of output format for Latnet Builder can be specified on the command line, using the following keywords: + +- `lattice` +A lattice rule: give the modulus and the generating vector. +- `dnet` +A digital net: give the generating matrices, one per line. +- `plattice` +A polynomial lattice rule: give the polynomial modulus and the generating vector. +- `sobol` +Sobol' points (default format), give only the direction numbers. +- `soboljk` +Sobol' points, the format used by Joe and Kuo (2008). + + +The most important formats are the first two, since the point sets covered by the other formats are special cases of digital nets, so they can all be described by the `dnet` format. We propose them because they provide alternative representations that are either more compact or commonly used. 
+ +All the point sets that we consider have the form + +$$P_n = \{\boldsymbol{u}_i \in [0,1)^s,\; i=0,\dots,n-1\}$$ + +where $n$ is the number of points and $s$ is the number of dimensions. The dimension $j$ goes from 1 to $s$ and there are $n$ points enumerated by $i$ going from 0 to $n-1$. (In computer code, $j$ usually starts at 0, whereas in math papers, it starts at 1; one must be careful about this discrepancy.) The .txt files that contain the parameters have one line per dimension, preceded by a few lines that contain general parameters, such as $s$, $n$, etc. We shall call these lines the *header* of the file. In the header, additional lines that start with `#` can be used for comments and descriptions; these lines are totally optional and should be just skipped by the program that reads the file. Anything that starts with `#` on any given line in the header should also be skipped. All these comments are only for human readers to better see what is in the file; they are not for the computer[^1]. One exception: the first line of the file must be a comment that contains the keyword for the file type; for example `dnet` for a digital net. The number of dimensions (number of lines after the header) can be much larger than what we usually need; it suffices to use the number of rows that are needed. + +[^1]: Comments are now allowed only in the header lines, + not in the $s$ lines that follow. This makes more sense. + +The point sets can be extensible in the number of points $n$ or not (they can be constructed for a single $n$ only). Sobol points are extensible ad infinitum, although they are very good only when $n$ is a power of 2. Other types of point sets can also be extensible, but are usually constructed to be good only for $n = b^k$ for $k$ in a given integer range, e.g., from 10 to 20, where $b \ge 2$ is the base. They satisfy the property that $P_{b^k}$ contains $P_{b^{k-1}}$ for all $k$ in this range. We call them *embedded* point sets. For these types of point sets it is highly recommended to specify the range in a comment in the header of the file [^2]. + +[^2]: The range for which the points were built could be given in the file, but this makes things a bit more complicated for some point sets. For example, for ordinary lattice rules with a prime number of points, this additional info might be confusing for some users. For Sobol points, the range has no limit. + +In the proposed formats, the files do not assume a given computer word size (e.g., 32 bits or 64 bits). The format is exactly the same regardless of the word size. Of course, if the file contains integers of more than 32 bits, the corresponding points cannot be generated properly on a 32 bit computer. A comment in the file header can say so. + +Some users might prefer input files with no header at all, only the $s$ lines that give the generating vector or generating matrices. In some languages (e.g., MATLAB), such a file can be read into a matrix by a simple "load file" command, so there is no need to write any code to read the file. Users who want that can simply strip out the header from the files in standard format and use these naked files privately. We think that the header with human-readable comments as imposed by the standard will be very useful to many users. + +The following sections describe the proposed text-file formats for the different point sets. 
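+
+Before turning to the individual formats, here is a minimal sketch in Python of a reader that follows the conventions above (the function name and the return convention are ours, for illustration; they are not part of the standard): it takes the file-type keyword from the first line, skips comment lines, strips inline `#` comments, and returns the remaining values as integers.
+
+```python
+def read_param_file(path):
+    """Read a parameter file; return the file-type keyword and one list of integers per data line."""
+    with open(path) as f:
+        keyword = f.readline().lstrip("# ").split()[0]  # first line names the type, e.g. 'lattice'
+        rows = []
+        for line in f:
+            data = line.split("#", 1)[0].strip()  # drop inline comments (the standard allows them in the header)
+            if data:                              # skip blank lines and pure comment lines
+                rows.append([int(tok) for tok in data.split()])
+    return keyword, rows
+```
+
+For the `lattice` example in the next section, this sketch would return `("lattice", [[8], [65536], [1], [19463], ...])`, from which $s$, $n$, and the $a_j$ are read off in order.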
+ + +## Types of point sets and notation + + +### Parameters for ordinary lattice rules: `lattice` + +For an ordinary *lattice rule of rank 1*, we have + +$$P_n = \{\boldsymbol{u}_i = (i \boldsymbol{a} \bmod n)/n,\; i=0,\dots,n-1\}$$ + +where $\boldsymbol{a} = (a_1,\dots,a_s)$ is the generating vector. We must specify $s$, $n$, and $\boldsymbol{a}$. + +In a `lattice` file, the first line must start with `# lattice`. After that, not counting the comment lines, the first line gives the number $s$ of dimensions, the second line gives the number $n$ of points, and lines 3 to $s+2$ give the coefficients $a_1,\dots, a_s$, one value per line. In the case of embedded lattice rules, $n$ would usually be a power of 2, say $n=2^k$, and the smaller embedded lattices will contain the first $2^{k-1}$ points, the first $2^{k-2}$ points, etc. +Additional comments in the file should tell when the lattice is embedded, which figure of merit and what weights were used, the construction method, etc. [^3] + +[^3]: Should we be forced to always put the embedding range in the file for the computer to read? My suggestion is not to force the computer to read it, but just put it as a comment for humans. Otherwise, it will force an additional line that gives the base and the range. What would we put for example if $n$ is prime and the rule is not embedded? Should we have a `lattice2` format for embedded rules in base 2? Putting more options makes things more complicated. + +One example of a parameter file for an ordinary lattice rule, in `lattice` format, is given below. In this file, the first two lines are skipped, only the number `8` is read on the third line, only the number `65536` is read on the fourth line, etc. + +``` +# lattice +# A lattice rule, non-embedded, in 'lattice' format +8 # 8 dimensions +65536 # modulus = n = 65536 points +# coordinates of the generating vector, starting at j=1: +1 +19463 +17213 +5895 +14865 +31925 +30921 +26671 +``` + +### Parameters for digital nets: `dnet` + +A *digital net in base* $b$ with $n=b^k$ points is defined by selecting integers $s \geq 1$, $r \geq k \geq 1$, and $s$ matrices $\boldsymbol{C}_1,\dots,\boldsymbol{C}_s$ of size $r \times k$ with entries in $\mathbb{Z}_b$, called the generating matrices. For $i=0,\dots,n-1$, let $i = \sum_{\ell=0}^{k-1} a_{i,\ell} b^\ell$ be the expansion of $i$ in base $b$, and for $j=1,\dots,s$, let + +$$(y_{i, j, 1}, \dots, y_{i, j, r})^T = \boldsymbol{C}_j \cdot (a_{i, 0}, \dots, a_{i, k-1})^T$$ + +and + +$$u_{i, j} = \sum_{\ell=1}^{r} y_{i, j, \ell} b^{-\ell}.$$ + +The points $\boldsymbol{u}_i$ are defined by $\boldsymbol{u}_i = (u_{i,1},\dots,u_{i,s})$. Digital nets are usually in base $b=2$, but we allow a general (typically prime) base $b \ge 2$. + +The proposed format to specify digital nets is as follows. The first line must start with `# dnet`. Then the first four non-comment lines give $b$ (the base), $s$ (the number of dimensions), $k$ (the number of columns), and $r$ (the number of rows in the generating matrices in base $b$). Thus, the output values will have "precision" $b^{-r}$ (they will be integer multiples of $b^{-r}$). For $b=2$, a common value in the past has been $r=31$ when using 32 bit integers, but going forward we should use 64 bit integers and $r=63$ or 64, or perhaps $r=53$ to exploit the full accuracy of a `double`. By looking at $r$, one can see right away whether this file is good for 64 bit computers only or for 32 bit computers as well. 
+ +The $s$ lines after this header will contain the $s$ generating matrices, one per line. Each of these lines contains $k$ integers smaller than $b^r$ giving the $k$ columns of $\boldsymbol{C}_j$, using by default the same encoding as in the class `DigitalNetBase2` in SSJ for $b=2$. That is, the base $b$ representation of the integer gives the $r$ digits in the corresponding column, with the digit on the first row of the matrix (row 0) being the most significant, and the one on the last row (row $r-1$) being the least significant. For example, if $b=2$, $r=31$, and the first column has a 1 in the first row and 0 in all other rows, +as is always the case for Sobol points, then the integer representation of this column will be $2^{30} = 1\,073\,741\,824$. If there is a 1 in the last row and 0 elsewhere, the representation will be $2^0 = 1$. If all 31 elements of the column are 1, the representation will be $2^{31}-1$. + +One example of a file for a digital net in `dnet` format: + +``` +# dnet +# A digital net in base 2, in 'dnet' format +2 # base b = 2 +8 # s = 8 dimensions +10 # k = 10, so n = 2^10 = 1024 points +31 # r = 31 digits +# The columns of gen. matrices C_1, ..., C_s, one matrix per line: +1073741824 536870912 268435456 134217728 ... +2012537125 1382645254 ... +... +``` + +This differs from Joe and Kuo (2008), where the $c$-th column (for $c\ge 1$) is represented by an integer smaller than $2^c$ (in base 2) and the least significant bit is the one on the diagonal. Their representation works when $\boldsymbol{C}_j$ is upper triangular, which is true for Sobol point sets, but not for digital nets in general. + +Recall that coordinate $j$ of the $i$-th point is obtained by multiplying the base $b$ matrix $\boldsymbol{C}_j$ by the vector of digits of the representation of $i$ in base $b$, with the least significant digits of $i$ at the top. That is, the least significant digit of $i$ goes with the first column of $\boldsymbol{C}_j$. And the first row of $\boldsymbol{C}_j$ is for the most significant digit of output. With our representation of $\boldsymbol{C}_j$ by $k$ integers, the points are easy and fast to generate in base 2. We obtain `u[i,j]`, coordinate $j$ of point $i$, with the following code snippet, in which `C[j,c]` is the integer that represents column $c$ of $\boldsymbol{C}_j$: + +```python +normFactor = 1.0 / (1 << r) # 2^(-r) +coord = 0 +for c in range(k): + coord ^= ((i >> c) & 1) * C[j,c] + u[i,j] = coord * normFactor +``` + + +### Parameters for polynomial lattice rules: `plattice` + +*Polynomial lattice rules* are a special type of digital net, with generating matrices of a particular form. For a polynomial lattice rule of rank 1 in a prime base $b$, we have + +$$P_n = \left\{\left(\varphi\left(\frac{h(z) a_1(z)}{Q(z)}\right), \dots, \varphi\left(\frac{h(z) a_s(z)}{Q(z)}\right)\right) : h(z)\in\mathbb{F}_b[z], \ \text{degree}(h(z)) < k\right\},$$ + +where $\mathbb{F}_b[z]$ is the space of polynomials with coefficients in $\mathbb{F}_b$, the *modulus* $Q(z) \in \mathbb{F}_b[z]$ is a polynomial of degree $k$, the *generating vector* $\boldsymbol{a}(z) = (a_1(z),\dots,a_s(z))\in \mathbb{F}_b[z]^s$ is a vector of $s$ polynomials of degrees less than $k$, and the mapping $\varphi$ is defined by + +$$\varphi\left(\sum_{l = w}^{\infty} x_l z^{-l}\right) = \sum_{l = \max(w, 1)}^{\infty} x_l b^{-l}.$$ + +This point set has $n = b^k$ points. + +We must specify the polynomial modulus $Q(z)$ and the polynomial generating vector $\boldsymbol{a}(z)$. 
The polynomial modulus will be represented as an integer that has $(k+1)$ digits in base $b$, and all the other polynomials will be represented as integers that have no more than $k$ digits in base $b$. All these integers will be given in base 10 in the file, one per line. In practice, we usually have $b=2$, so $k$ represents the number of bits. The integer that represents a polynomial is obtained simply by replacing the formal variable by $b$. For example, if the polynomial is $Q(z) = z^4 + z^3 + 1$ and $b=2$, its coefficients are "1 1 0 0 1" and its integer representation is $2^4 + 2^3 + 1 = 25$. This is the usual representation, as used in Goda and Dick (2015), for example. In the case of embedded point sets, the modulus should be $Q(z) = z^k$ for $n=b^k$ points, and its integer representation is $b^k$. In particular, $Q(z) = z$ is represented by the integer $b$. + +As usual, the first line is a comment that tells the type of file. Then the first four non-comment lines give the base $b$, the number $s$ of dimensions, the degree $k$ of the polynomial modulus, and the integer representation of this polynomial. Lines 5 to $s+4$ give the polynomials that form the generating vector, one per line, using the integer representation just explained. One example of a file for a polynomial lattice in the `plattice` format: + +``` +# plattice +# A polynomial lattice rule in base 2, in 'plattice' format +2 # base b = 2 +8 # s = 8 dimensions +16 # k = 16, so n = 2^16 = 65536 points +45781 # polynomial modulus +# coordinates of the generating vector, starting at j=1: +1 +17213 +5895 +14865 +31925 +30921 +26671 +17213 +``` + +A polynomial lattice rule in base $b$ can also be represented as a digital net in base $b$, so its parameters can also be provided in a file in the `dnet` format, as for a general digital net in base $b$. But the generating matrices have a special form and the above representation is much more compact (a single integer per line instead of $k$ integers per line). On the other hand, generating the points is faster with the generating matrices than with the polynomial representation, so the software that will use the `plattice` files and generate the points would usually first convert the polynomials into the corresponding generating matrices. LatNet Builder (L’Ecuyer et al., 2022) is also able to make the conversion and produce a file in the `dnet` format, for more convenience and better flexibility, so the user can select the format she/he prefers. + +### Parameters for Sobol nets: `sobol` and `soboljk` + +The Sobol' construction provides another special case of digital nets (and sequences), in base 2. They are defined in many places, including Joe and Kuo (2008). For each coordinate $j$, we select a primitive polynomial $p_j(z)$ of degree $c_j$, and $c_j$ integers $m_{j,1},\dots,m_{j,c_j}$ which are used to define the generating matrix $\boldsymbol{C}_j$. The real numbers $2^{-c} m_{j,c}$ are called the +initial *direction numbers*. More details are given in Joe and Kuo (2008) and [here](http://umontreal-simul.github.io/ssj/docs/master/classumontreal_1_1ssj_1_1hups_1_1SobolSequence.html). + +One obvious option for these point sets is to adopt exactly the same format as Joe and Kuo (2008), because it is already used in many places. The only difference is that we now allow comment lines in the file. In the format of Joe and Kuo (2008), only the first line is skipped. 
In the proposed format, other comment lines can be added at the beginning of the file, e.g., to give the maximum number of dimensions in the file, the criterion and weights that were used, etc. +Note that Sobol' sequences have an infinite number of points and an unlimited number of dimensions, although the file will give parameters for a finite number of dimensions. + +The other lines of the file specify the primitive polynomials and the initial direction numbers for each dimension $j \ge 2$, one line per dimension. For dimension $j=1$, the generating matrix is the identity and is not given in the file (it is implicit). The columns of this matrix are not obtained via a recurrence based on a primitive polynomial, +so this matrix is handled separately. + +The first number on each line is the dimension $j$. +The second number is the degree $c_j$ of the primitive polynomial $p_j(x)$ used for this dimension. The third number is the integer that corresponds to the binary representation of the inner coefficients of this polynomial (we ignore the first and last coefficients, which are always 1). For example, if the polynomial is $p_j(x) = x^4 + x^3 + 1$, the coefficients are "1 1 0 0 1", and after removing the first and last "1", we get 100 in base 2, which is 4, so the third column would contain the number 4. (Without removing the first and last "1", the number would be 25 instead.) After these three numbers, there are $c_j$ integers $m_{j,1},\dots,m_{j,c_j}$ where $m_{j,c}$ is the $c^\text{th}$ (real-valued) initial direction number for this coordinate, multiplied by $2^c$ to obtain an integer. This $m_{j,c}$ is the integer formed by taking the bits in row 1 to row $c$ of column $c$, in this order. The last bit is the bit on the diagonal, which is always 1, so all $m_{j,c}$'s are odd integers. I think this format comes from Bratley and Fox (1988). + +We denote this format for Sobol parameters by the `soboljk` keyword. One example of a file in this format is shown below. The first line gives the type of file and the next three lines are comments that must be skipped by the reading program. + +``` +# soboljk +# Parameters for Sobol points, in 'soboljk' format +# 8 dimensions +# j c_j p_j m_{j,c} +2 1 0 1 +3 2 1 1 3 +4 3 1 1 3 1 +5 3 2 1 1 1 +6 4 1 1 1 3 3 +7 4 4 1 3 5 13 +8 5 2 1 1 5 5 17 +``` + +The `soboljk` format can be simplified as follows. First, removing the first and last "1" in the representation of the primitive polynomials saves a bit of memory, but it also makes things slightly more complicated. In the default representations of the primitive polynomials in the code that generates the points, these bits are usually not removed. In SSJ, the first thing we do when reading a file in `soboljk` format is to add them back. Also, the primitive polynomials can be in a separate file, since they never change, and only the (initial) direction numbers (those depend on the selected FOM and weights) would be given to specify the Sobol' points. That is, we remove the first three columns of the `soboljk` format. The Magic Point Shop (Nuyens, 2020) also produces files that contain only the direction numbers. + +One example of a file in this `sobol` format: + +``` +# sobol +# Parameters m_{j,c} for Sobol points, in 'sobol' format +# 8 dimensions +1 # This is m_{j,c} for the second coordinate +1 3 +1 3 1 +1 1 1 +1 1 3 3 +1 3 5 13 +1 1 5 5 17 +``` + +A list of the first few primitive polynomials in base 2 is given [here](https://mathworld.wolfram.com/PrimitivePolynomial.html). 
If we *do not* remove the first and last 1's in their representations, the first primitive polynomials are: `3, 7, 11, 13, 19, 25, ...`. Their degrees are `1, 2, 3, 3, 4, 4, ...`. This representation is the one used in the code of SSJ, for example. We can have a separate file that gives these polynomials, one per line, exactly as in the first three columns of the `soboljk` format. We may also want to remove the first column. + +Another, perhaps more convenient, way of storing Sobol' constructions is to just use the general `dnet` format, in which the generating matrices are given explicitly. This `dnet` format is easier to use. On the other hand, it requires specifying a (maximum) value of $k$, and $k$ integers per row to specify the generating matrices, which leads to larger files. From a file in `sobol` format, one can construct a digital net with an arbitrarily large $k$. + +When $n = 2^k$ is fixed, so we use exactly $n = 2^k$ points and there is no embedding, we can add one *extra dimension* at the beginning by using the reflected identity as a generating matrix. The successive values for this coordinate will then be $0, 1/n, 2/n, 3/n, \dots$ in this order. This matrix will not be given in the file for Sobol' points; the QMC/RQMC software must handle it. For lattice rules and general digital nets with fixed $n$ (non-embedded), the file could give a first coordinate with this behavior. + +## Files that contain randomizations + +The idea of proposing a format for storing specific randomizations was suggested by Fred Hickernell. This can be useful for verification purposes, for example. + +We can store randomizations in the following file formats: + +- `shiftmod1` +A (random) shift modulo 1. It corresponds to a single point in $[0,1)^s$. +- `dshift` +A digital shift in base $b$. +Also a single point in $[0,1)^s$, but with $r$ digits in base $b$. +- `nuscramble` +A nested uniform scramble in base $b$. +- `lmscramble` +A (linear) left matrix scramble in base $b$. + + +For a `shiftmod1` in $s$ dimensions, the file will contain $s$ in the first non-comment line, followed by $s$ real numbers between 0 and 1, one per line. + +``` +# shiftmod1 +# A shift modulo 1, in 'shiftmod1' format +3 # s = 3 dimensions +0.32638741823951621 +0.91325392536931693 +0.15303640401106301 +``` + +For a `dshift` with $r$ digits of accuracy in base $b$, in $s$ dimensions, the file will contain $b$ in the first non-comment line, $s$ in the second line, $r$ in the third line, and then $s$ integers from 0 to $b^r-1$, one per line. For the latter, the digits of the base $b$ representation of the integer divided by $b^r$ will be added modulo $b$ to the corresponding digits of the base $b$ representation of the coordinate. For example, if $b=2$ and $r=31$, the randomization XORs the 31 bits of this integer with the 31 most significant bits of the corresponding coordinate of each point. + +``` +# dshift +# A digital shift in base 2, in 'dshift' format +2 # b = 2 +3 # s = 3 +31 # r = 31 +2146832861 +1084390381 +963462828 +``` + +For a `lmscramble` with $r$ digits of accuracy, for $b^k$ points in base $b$ in $s$ dimensions, we need to store $s$ lower-triangular invertible $r\times r$ matrices with entries in $\{0,\dots,b-1\}$. For $b=2$, each matrix must have only 1's on the diagonal and 0's above the diagonal. Each such matrix can be stored in one line of the file, in exactly the same format as the generating matrices in the `dnet` format, using one integer for each column. 
We want them in this format for the fast LMS implementation we have in SSJ, for example. The file will contain $b$ in the first non-comment line, $s$ in the second line, $r$ in the third line, +and then $s$ square lower-triangular and invertible $r\times r$ matrices, one per line, with each column represented as an integer as in the `dnet` format. Thus, each scrambling matrix is represented by $r$ integers on the same line. +Here is an example: + +``` +# lmscramble +# A left matrix scramble in base 2, with 31 digits of resolution. +2 # base b = 2 +8 # s = 8 dimensions +31 # r = 31 digits +# The columns of the lower-triangular r x r scrambling matrices, one matrix per line: +1673741824 906870912 615843556 213427728 ... +2012537125 1012645254 ... +... +``` + +For a `nuscramble` of the first $r \ge k$ digits, for $n=b^k$ points in base $b$ in $s$ dimensions, with the implementation proposed in Section 3 of Friedel and Keller (2002) and used for $b=2$ in class `DigitalNetBase2` of SSJ, we need $sn$ blocks of $r$ random digits in base $b$. Each such block can be represented as an integer in the range $\{0,1,\dots,b^r-1\}$. For $b=2$, these are $r$ bit integers. We can store these integers one row per dimension, $n$ integers per row. This gives the following `nuscramble` file format. The first non-comment line contains the base $b$, the second line gives the number $s$ of dimensions, the third line gives $k$ (so that $n = b^k$), the fourth line gives the scramble resolution $r$ (the number of digits that are scrambled), and the following $s$ lines give the $sn$ integers used for the scrambling, $n$ integers per line. Note that this is the same number of random values that we would need if we used plain Monte Carlo instead of RQMC[^4][^5]. + +[^4]: Another way of storing the NUS is as follows. For each coordinate $j$, each point can be identified by a $k$ bit integer, and the NUS maps each such $k$ bit integer to an $r$ bit integer that corresponds to the scrambled coordinate $j$ of this point. So we can simply store this map in an array of size $b^k$ whose entry $i$ contains the corresponding $r$ bit integer. Applying this NUS is then fast and straightforward. + +[^5]: Alternative implementations of NUS that use a hashing function in place of an RNG are proposed in Burley (2020) and Laine and Karras (2011). These methods might be faster, and there is much less information to store to reproduce a given scramble, but the hashing function must be fixed, known, and reliable. This essentially amounts to fixing the RNG and storing only its seed. + +``` +# nuscramble +# A nested uniform scramble in base 2, with 30 bits of resolution. +2 # base b = 2 +8 # s = 8 dimensions +10 # k = 10, so n = 2^10 = 1024 points +30 # r = 30 digits +# The following s rows contain n = 1024 30 bit integers per row: +1173741824 906870912 615843556 213427728 ... +1012537125 1001975254 ... +... +``` + +## File names and other recommendations + +It is strongly recommended that all file names start with the corresponding keyword, like `plattice` for a polynomial lattice rule, `sobol` for a Sobol point set, and `lmscramble` for a left matrix scramble, for example. + +It is also recommended to put enough relevant comments in each file for a knowledgeable human to tell what the file is for (type of point set, figure of merit and weights that were used to construct it, range of values of $n$ for embedded point sets, etc.). + + +We also want some unit tests: some specific parameter files together with the correct output that should be observed when generating the points from these files. 
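+
+As a starting point, here is a minimal sketch of such a test (plain Python/pytest; the helper function is ours, and the hard-coded parameters are taken from the `lattice` example above):
+
+```python
+def lattice_points(n, a, num_points):
+    """First num_points points of the rank-1 lattice rule with modulus n and generating vector a."""
+    return [[(i * aj % n) / n for aj in a] for i in range(num_points)]
+
+def test_first_lattice_points():
+    n = 65536
+    a = [1, 19463, 17213, 5895, 14865, 31925, 30921, 26671]
+    pts = lattice_points(n, a, 3)
+    assert pts[0] == [0.0] * len(a)          # point 0 is the origin
+    assert pts[1] == [aj / n for aj in a]    # point 1 is a/n, since all a_j < n here
+    assert pts[2][1] == (2 * 19463 % n) / n  # = 38926/65536
+```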
+ +## References + +- P. Bratley and B. L. Fox. Algorithm 659: Implementing Sobol’s quasirandom sequence generator. ACM Transactions on Mathematical Software, 14(1):88–100, 1988. +- Brent Burley. Practical hash-based Owen scrambling. Journal of Computer Graphics Techniques, 9(4):1–20, 2020. +- I. Friedel and A. Keller. Fast generation of randomized low-discrepancy point sets. In K.-T. Fang, F. J. Hickernell, and H. Niederreiter, editors, Monte Carlo and Quasi-Monte Carlo Methods 2000, pages 257–273, Berlin, 2002. Springer-Verlag. +- T. Goda and J. Dick. Construction of interlaced scrambled polynomial lattice rules of arbitrary high order. Foundations of Computational Mathematics, 15:1245–1278, 2015. +- S. Joe and F. Y. Kuo. Constructing Sobol sequences with better two-dimensional projections. SIAM Journal on Scientific Computing, 30(5):2635–2654, 2008. +- S. Laine and T. Karras. Stratified sampling for stochastic transparency. Computer Graphics Forum, 30(4):1197–1204, 2011. +- P. L’Ecuyer. SSJ: Stochastic simulation in Java. http://simul.iro.umontreal.ca/ssj/, accessed 9th August 2021, 2016. +- P. L’Ecuyer, P. Marion, M. Godin, and F. Puchhammer. A tool for custom construction of QMC and RQMC point sets. In A. Keller, editor, Monte Carlo and Quasi-Monte Carlo Methods: MCQMC 2020, pages 51–70, Berlin, 2022. Springer. https://arxiv.org/abs/2012.10263. +- D. Nuyens. The Magic Point Shop, 2020. https://people.cs.kuleuven.be/~dirk.nuyens/qmc-generators/. \ No newline at end of file diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000..f67ae09 --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,13 @@ +Copyright [2021] [Illinois Institute of Technology] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/PUBLISH2HF.md b/PUBLISH2HF.md new file mode 100644 index 0000000..a117ea8 --- /dev/null +++ b/PUBLISH2HF.md @@ -0,0 +1,151 @@ +# Publishing LDData to Hugging Face (upload.py) + +This document summarizes how to use `upload.py` to publish the `LDData` repository to the Hugging Face Datasets Hub, and provides notes for CI and secure authentication. + +## Requirements + +- Python 3.8+ +- `huggingface_hub>=0.32.0` (install with `pip install "huggingface_hub>=0.32.0"`) +- Network access to `huggingface.co` when uploading + +## Basic usage + +Run from the repository root (or pass `--local-path`): + +```bash +python upload.py \ + --repo-id QMCSoftware/LDData \ + --local-path . +``` + +This will create (or reuse) the dataset repo `QMCSoftware/LDData` on the Hub and upload files from the local checkout. + +## Important flags + +- `--repo-id`: target HF dataset id (default: `QMCSoftware/LDData`). +- `--local-path`: path to local LDData checkout (default: `.`). +- `--token`: HF token. If omitted, the script reads the `HF_TOKEN` environment variable. +- `--private`: create the dataset as private. +- `--dry-run`: don't upload; print what would be done. +- `--reset-remote`: delete the remote dataset repo on the Hub before uploading (destructive). 
+- `--yes`: skip interactive confirmation prompts (use with care, required for non-interactive CI with `--reset-remote`). + +Example (non-interactive destructive reset + upload): + +```bash +export HF_TOKEN="hf_xxx..." +python upload.py --repo-id QMCSoftware/LDData --local-path . --reset-remote --yes +``` + +## CI integration (GitHub Actions) + +Create a repository secret named `HF_TOKEN` containing a Hugging Face token with the required permissions. Example workflow step: + +```yaml +- name: Upload to HuggingFace + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + run: | + python upload.py \ + --repo-id QMCSoftware/LDData \ + --local-path . \ + --reset-remote \ + --yes +``` + +Note: the repository includes a workflow `.github/workflows/sync-to-huggingface.yml` that already calls `upload.py` for automated syncs. If you want to avoid destructive resets in CI, remove `--reset-remote --yes` from the workflow. + +## Local development (recommended) + +This repository includes an `env.yml` Conda environment for reproducible local development and for CI. Use it to create and activate the `lddata` environment, then install the package in editable mode: + +```bash +conda env create -f env.yml +conda activate lddata +python -m pip install --upgrade pip +python -m pip install -e . +``` + +Notes: +- CI uses `env.yml` (via `conda-incubator/setup-miniconda`) so the same environment is reproducible in GitHub Actions. +- `pip install -e .` uses the project's packaging (see `pyproject.toml`) so tests can import the `LDData` package. + +## Packaging (pyproject.toml) + +This repo uses a PEP 621 `pyproject.toml` with setuptools as the build backend. That lets `pip install -e .` (editable install) work consistently in both local and CI environments. + +If you prefer modern packaging workflows, keep `pyproject.toml` and use `pip` inside the conda env as shown above. + +## Run tests + +Unit tests are designed to be hermetic (they mock network calls). Integration tests that exercise the real Hugging Face API are gated and skipped by default. + +Run the full unit test suite: + +```bash +pytest -q +``` + +Run the destructive integration test (WARNING: deletes the remote dataset): + +```bash +export HF_TOKEN=hf_xxx... # token with write/delete perms +export HF_INTEGRATION=1 +pytest -q tests/test_integration_hf.py +``` + +The integration test is intentionally skipped unless you set `HF_INTEGRATION=1` and provide `HF_TOKEN`. This prevents accidental destructive runs in CI or by contributors. + +## Quick checklist before publishing + +- Ensure `HF_TOKEN` (or SSH deploy key) is available and has write/delete permissions for the target dataset. +- Verify `upload.py` flags: use `--dry-run` to preview, and use `--reset-remote --yes` only when you intend to wipe the remote repo. +- Prefer SSH deploy keys for persistent automation where possible. + + +## Authentication & security + +Preferred options (ordered): + +1. **SSH deploy key** (recommended for long-lived automation) + - On Hugging Face dataset repo settings, add a *deploy key* with write access. + - Use the SSH clone URL `git@huggingface.co:datasets/ORG/REPO.git` in scripts or CI. + - Benefits: no tokens in environment, standard SSH key management. + +2. **CI secrets + `GIT_ASKPASS`** (used by `scripts/git_lfs_upload.sh`) + - Inject token into CI via secrets (e.g. `HF_TOKEN`) and use a temporary `GIT_ASKPASS` helper script so the token is not visible in process listings. 
+ - The repository's `scripts/git_lfs_upload.sh` uses this method to avoid embedding the token in the URL. + +3. **HF_TOKEN environment variable** passed directly to `upload.py` + - `upload.py` reads `HF_TOKEN` from the environment if `--token` is not provided. + - This is acceptable for CI (secrets are injected into the runner), but avoid printing the token or embedding it in command lines. + +Avoid embedding tokens in clone URLs (e.g. `https://hf_xxx@...`) because they appear in `ps` output, shell history and logs. + +## Safety notes about `--reset-remote` + +- `--reset-remote` deletes the remote dataset repository (destructive). Use `--yes` to skip confirmation in automation. +- If deletion fails (insufficient permissions or other errors), the script currently logs a warning and continues to (re)create the repo — change this behavior if you want strict failure. + +## Troubleshooting + +- If you see `ModuleNotFoundError: No module named 'huggingface_hub'`, install the dependency in the environment running the script: `pip install huggingface_hub`. +- For intermittent upload errors, the script uses retries and exponential backoff; ensure you have stable network connectivity. + +## Examples + +- Dry run to see what would happen: + ```bash + python upload.py --local-path . --dry-run + ``` + +- Upload privately without prompting: + ```bash + python upload.py --repo-id MyOrg/MyDataset --private --yes + ``` + +## Related scripts + +- `scripts/git_lfs_upload.sh` — alternate upload method that uses git + git-lfs and commits/pushes selected folders; supports `GIT_ASKPASS` when `HF_TOKEN` is set. +- `.github/workflows/sync-to-huggingface.yml` — example workflow that syncs selected paths to Hugging Face using `upload.py`. +- `.github/workflows/ci.yml` — main CI workflow that runs tests; does not upload by default. \ No newline at end of file diff --git a/README.md b/README.md index e19149d..f40a40a 100644 --- a/README.md +++ b/README.md @@ -1,315 +1,266 @@ -# Low Discrepancy Data +# LDData: Low-Discrepancy Generating Vectors and Matrices -Low discrepancy generating vectors and matrices. +A curated collection of **low-discrepancy point set parameters** including **lattice rules**, **digital nets**, **polynomial lattice rules**, **Sobol' nets**, and **RQMC randomizations**. This dataset enables reproducible research and high-performance Quasi–Monte Carlo (QMC) and Randomized QMC (RQMC) simulation. -## Softwares +The [LDData repository](https://github.com/QMCSoftware/LDData) provides **standard text-based formats** for specifying structures used in QMC point generation across arbitrarily high dimensions. -- [LatNet Builder](https://github.com/umontreal-simul/latnetbuilder) -- [Magic Point Shop](https://people.cs.kuleuven.be/~dirk.nuyens/qmc-generators/) +--- -## General ideas and goals +## Dataset Summary -We propose *standard formats* to specify lattice rules, polynomial lattice rules, and digital nets, in simple text files. We want the formats to be simple and relatively compact, with no more than one line per dimension, -so they can easily be used for point sets in several thousand dimensions if desired. Ordinary text files with decimal numbers are good enough. They are easy to read by both humans and computers in any language. Other specialized formats (Json or Parquet, for example) can be more compact but then -the files are not as easy to read without extra tools. 
+LDData is a dataset of structured parameter files defining: -Joe and Kuo (2008) provide a file for up to 21201 dimensions for Sobol' points, and this seems to be the most widely used set of parameters for RQMC points at this time. They use a fixed and very simple format, which requires no special software to read. We want to provide similar types of files for other types of point sets, for an arbitrarily large number of dimensions. The SSJ simulation library (L’Ecuyer, 2016) can already read some of our proposed formats. Latnet Builder (L’Ecuyer et al., 2022) can (or will) produce output files in these formats. The choice of output format for Latnet Builder can be specified on the command line, using the following keywords: +- Rank-1 **lattice rules** +- Base-$b$ **digital nets** +- **Polynomial lattice rules** +- **Sobol' sequences** (in both the `sobol` and Joe–Kuo `soboljk` formats) +- Various **randomizations** (shift modulo 1, digital shifts, nested uniform + scrambles, left matrix scrambles) -- `lattice` -A lattice rule: give the modulus and the generating vector. -- `dnet` -A digital net: give the generating matrices, one per line. -- `plattice` -A polynomial lattice rule: give the polynomial modulus and the generating vector. -- `sobol` -Sobol' points (default format), give only the direction numbers. -- `soboljk` -Sobol' points, the format used by Joe and Kuo (2008). +Each file type follows a simple textual standard to ensure: +- Human readability +- Language-agnostic parsing +- Long-term reproducibility +- Extensibility to thousands of dimensions +The dataset is motivated by the need for **standardized, compact, transparent formats** in [our QMC research and software](https://github.com/QMCSoftware). -The most important formats are the first two, since the point sets covered by the other formats are special cases of digital nets, so they can all be described by the `dnet` format. We propose them because they provide alternative representations that are either more compact or commonly used. +--- -All the point sets that we consider have the form +## Motivation -$$P_n = \{\boldsymbol{u}_i \in [0,1)^s,\; i=0,\dots,n-1\}$$ +Many QMC constructions appear across scattered software packages, papers, or custom formats. LDData brings these formats together into a **consistent, unified, machine-readable** repository for: -where $n$ is the number of points and $s$ is the number of dimensions. The dimension $j$ goes from 1 to $s$ and there are $n$ points enumerated by $i$ going from 0 to $n-1$. (In computer code, $j$ usually starts at 0, whereas in math papers, it starts at 1; one must be careful about this discrepancy.) The .txt files that contain the parameters have one line per dimension, preceded by a few lines that contain general parameters, such as $s$, $n$, etc. We shall call these lines the *header* of the file. In the header, additional lines that start with `#` can be used for comments and descriptions; these lines are totally optional and should be just skipped by the program that reads the file. Anything that starts with `#` on any given line in the header should also be skipped. All these comments are only for human readers to better see what is in the file, they are not for the computer[^1]. One exception: the first line of the file must be a comment that contains the keyword for the file type; for example `dnet` for a digital net. The number of dimensions (number of lines after the header) can be much larger than what we usually need; it suffices to use the number of rows that are needed. 
+- Researchers developing new QMC methods +- Practitioners needing high-dimensional low-discrepancy point sets +- Developers of simulation libraries such as SSJ, QMCPy, and LatNet Builder -[^1]: Comments are now allowed only in the header lines, - not in the $s$ lines that follow. This makes more sense. +This dataset is linked to the research works described in the Citation section below. **For detailed technical specifications and implementation details**, see +[LD_DATA.md](LD_DATA.md) -The point sets can be extensible in the number of points $n$ or not (they can be constructed for a single $n$ only). Sobol points are extensible ad infinitum, although they are very good only when $n$ is a power of 2. Other types of point sets can also also be extensible, but are usually constructed to be good only for $n = b^k$ for $k$ in a given integer range, e.g., from 10 to 20, where $b \ge 2$ is the base. They satisfy the property that $P_{b^k}$ contains $P_{b^{k-1}}$ for all $k$ in this range. We call them *embedded* point sets. For these types of point sets it is highly recommended to specify the range in a comment in the header of the file [^2]. +--- -[^2]: The range for which the points were built could be given in the file, but this makes things a bit more complicated for some point sets. For example, for ordinary lattice rules with a prime number of points, this additional info might be confusing for some users. For Sobol points, the range has no limit. +## Supported Tasks and Applications -In the proposed formats, the files do not assume a given computer word size (e.g., 32 bits or 64 bits). The format is exactly the same regardless of the word size. Of course, if the file contains integers of more than 32 bits, the corresponding points cannot be generated properly on a 32 bit computer. A comment in the file header can say it. +### ✔️ Quasi-Monte Carlo (QMC) +Generate deterministic point sets with excellent equidistribution. -Some users might prefer input files with no header at all, only the $s$ lines that give the generating vector or generating matrices. In some languages (e.g., MATLAB), such a file can be read into a matrix by a simple "load file" command, so there is no need to to write any code to read the file. Users who want that can simply strip out the header from the files in standard format and use these naked files privately. We think that the header with human-readable comments as imposed by the standard will be very useful to many users. +### ✔️ Randomized QMC (RQMC) +Use the included randomizations for variance estimation: +- Digital shifts +- Nested uniform scrambles +- Left-matrix scrambles -The following sections describe the proposed text-file formats for the different point sets. +### ✔️ High-dimensional Integration and Simulation +Used in: +- Bayesian computation +- Option pricing +- High-dimensional PDE solvers +- Uncertainty quantification +- Graphics and rendering research +- Machine learning sampling methods -## Types of point sets and notation +### ✔️ Benchmarking +Standard formats help evaluate new constructions against established ones. 
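+
+For example, a rank-1 lattice rule from this dataset can be randomized with a shift modulo 1 in a few lines. A minimal sketch in plain Python (the generating vector is the 8-dimensional example used later in this README; the helper is our illustration, not a fixed API):
+
+```python
+import random
+
+n, s = 65536, 8
+a = [1, 19463, 17213, 5895, 14865, 31925, 30921, 26671]  # from a 'lattice' parameter file
+shift = [random.random() for _ in range(s)]              # a 'shiftmod1' randomization
+
+def shifted_point(i):
+    # u_i = (i * a mod n) / n, then shift each coordinate modulo 1
+    return [((i * aj % n) / n + dj) % 1.0 for aj, dj in zip(a, shift)]
+
+points = [shifted_point(i) for i in range(n)]
+```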
+--- -### Parameters for ordinary lattice rules: `lattice` +## Features -For an ordinary *lattice rule of rank 1*, we have +- Simple `.txt` formats with **one line per dimension** +- Optional human-readable comments starting with `#` +- No binary encoding or word-size assumptions +- Supports extremely high dimensions (10,000+) +- Extensible constructions (e.g., Sobol or embedded nets) +- All formats interoperable with QMC software (SSJ, QMCPy, LatNet Builder) -$$P_n = \{\boldsymbol{u}_i = (i \boldsymbol{a} \bmod n)/n,\; i=0,\dots,n-1\}$$ +--- -where $\boldsymbol{a} = (a_1,\dots,a_s)$ is the generating vector. We must specify $s$, $n$, and $\boldsymbol{a}$. +## How to Use the Dataset -In a `lattice` file, the first line must start with `# lattice`. After that, not counting the comment lines, the first line gives the number $s$ of dimensions, the second line gives the number $n$ of points, and lines 3 to $s+2$ give the coefficients $a_1,\dots, a_s$, one value per line. In the case of embedded lattice rules, $n$ would usually be a power of 2, say $n=2^k$, and the smaller embedded lattices will contain the first $2^{k-1}$ points, the first $2^{k-2}$ points, etc. -Additional comments in the file should tell when the lattice is embedded, which figure of merit and what weights were used, the construction method, etc. [^3] +### Load files directly from Hugging Face -[^3]: Should we be forced to always put the embedding range in the file for the computer to read? My suggestion is not to force the computer to read it, but just put it as a comment for humans, Otherwise, it will force an additional line that gives the base and the range. What would we put for example if $n$ is prime and the rule is not embedded? Should we have a `lattice2` format for embedded rules in base 2? Putting more options makes things more complicated. - -One example of a parameter file for an ordinary lattice rule, in `lattice` format is given below. In this file, the first line is skipped, only the number `8` is read on the second line, only the number `65536` is read on the second line, etc. +```python +from datasets import load_dataset +ds = load_dataset("QMCSoftware/LDData") ``` -# lattice -# A lattice rule, non-embedded, in 'lattice' format -8 # 8 dimensions -65536 # modulus = n = 65536 points -# coordinates of the generating vector, starting at j=1: -1 -19463 -17213 -5895 -14865 -31925 -30921 -26671 + +All data files are preserved in their directory structure and can be accessed +using: + +```python +ds["train"] # or ds['default'] ``` -### Parameters for digital nets: `dnet` +### Typical workflow -A *digital net in base* $b$ with $n=b^k$ points is defined by selecting integers $s \geq 1$, $r \geq k \geq 1$, and $s$ matrices $\boldsymbol{C}_1,\dots,\boldsymbol{C}_s$ of size $r \times k$ with entries in $\mathbb{Z}_b$, called the generating matrices. For $i=0,\dots,n-1$, let $i = \sum_{\ell=0}^{k-1} a_{i,\ell} b^\ell$ be the expansion of $i$ in base $b$, and for $j=1,\dots s$, let +1. Read a parameter file (e.g. `lattice_8d.txt`) +2. Parse header (`# lattice`, dimensions, n, etc.) +3. Parse one line per dimension for the generating vector or matrices +4. 
Construct QMC point generator in your preferred library -$$(y_{i, j, 1}, \dots, y_{i, j, r})^T = \boldsymbol{C}_j \cdot (a_{i, 0}, \dots, a_{i, k-1})^T$$ +--- -and +## Dataset Structure -$$u_{i, j} = \sum_{\ell=1}^{r} y_{i, j, \ell} b^{-\ell}.$$ +The dataset includes multiple categories of files: -The points $\boldsymbol{u}_i$ are defined by $\boldsymbol{u}_i = (u_{i,1},\dots,u_{i,s})$. Digital nets are usually in base $b=2$, but we allow a general (typically prime) base $b \ge 2$. +### 🔹 `lattice` +Rank-1 lattice generating vectors: +- Header: `# lattice` +- Parameters: + - Number of dimensions `s` + - Number of points `n` + - `s` lines of generating vector coefficients -The proposed format to specify digital nets is as follows. The first line must start with `# dnet`. Then the first four non-comment lines give $b$ (the base), $s$ (the number of dimensions), $k$ (the number of columns), and $r$ (the number of rows in the generating matrices in base $b$). Thus, the output values will have "precision" $b^{-r}$ (they will be integer multiples of $b^{-r}$). For $b=2$, a common value in the past has been $r=31$ when using 32 bit integers, but going forward we should use 64 bit integers and $r=63$ or 64, or perhaps $r=53$ to exploit the full accuracy of a `double`. By looking at $r$, one can see right away whether this file is good for 64 bit computers only or for 32 bit computers as well. +--- -The $s$ lines after this header will contain the $s$ generating matrices, one per line. Each of these lines contains $k$ integers smaller than $b^r$ giving the $k$ columns of $\boldsymbol{C}_j$, using by default the same encoding as in the class `DigitalNetBase2` in SSJ for $b=2$. That is, the base $b$ representation of the integer gives the $r$ digits in the corresponding column, with the digit on the first row of the matrix (row 0) being the most significant, and the one on the last row (row $r-1$) being the least significant. For example, if $b=2$, $r=31$, and the first column has a 1 in the first row and 0 in all other rows, -as is always the case for Sobol points, then the integer representation of this column will be $2^{30} = 1\,073\,741\,824$. If there is a 1 in the last row and 0 elsewhere, the representation will be $2^0 = 1$. If all 31 elements of the column are 1, the representation will be $2^{31}-1$. +### 🔹 `dnet` +General digital nets in base `b`: +- Header: `# dnet` +- Parameters: + - Base `b` + - Dimensions `s` + - Columns `k` + - Rows `r` +- Then `s` lines representing generating matrices -One example of a file for a digital net in `dnet` format: +Efficient for high-dimensional digital nets. -``` -# dnet -# A digital net in base 2, in 'dnet' format -2 # basis b = 2 -8 # s = 8 dimensions -10 # k = 10, so n = 2^10 = 1024 points -31 # r = 31 digits -# The columns of gen. matrices C_1, ..., C_s, one matrix per line: -1073741824 536870912 268435456 134217728 ... -2012537125 1382645254 ... -... -``` +--- -This differs from Joe and Kuo (2008), where the $c$-th column (for $c\ge 1$) is represented by an integer smaller than $2^c$ (in base 2) and the least significant bit is the one on the diagonal. Their representation works when $\boldsymbol{C}_j$ is upper triangular, which is true for Sobol point sets, but not for digital nets in general. 
+### 🔹 `plattice` +Polynomial lattice rules: +- Compact format using integer-encoded polynomials +- Base `b`, dimension `s`, polynomial degree `k`, and generating polynomials -Recall that coordinate $j$ of the $i$-th point is obtained by multiplying the base $b$ matrix $\boldsymbol{C}_j$ by the vector of digits of the representation of $i$ in base $b$, with the least significant digits of $i$ at the top. That is, the least significant digit of $i$ goes with the first column of $\boldsymbol{C}_j$. And the first row of $\boldsymbol{C}_j$ is for the most significant digit of output. With our representation of $\boldsymbol{C}_j$ by $k$ integers, the points are easy and fast to generate in base 2. We obtain `u[i,j]`, coordinate $j$ of point $i$, with the following code snippet, in which `C[j,c]` is the integer that represents column $c$ of $\boldsymbol{C}_j$: +--- -```python -normFactor = 1.0 / (1 << r) # 2^(-r) -coord = 0 -for c in range(k): - coord ^= ((i >> c) & 1) * C[j,c] - u[i,j] = coord * normFactor -``` +### 🔹 `sobol` and `soboljk` +Parameters for Sobol' sequences: +- `soboljk`: Joe & Kuo format with primitive polynomials and direction numbers +- `sobol`: Simplified direction-number only format +Used widely in QMC applications. -### Parameters for polynomial lattice rules: `plattice` +--- -*Polynomial lattice rules* are a special type of digital nets with generating matrices of a special form. For a polynomial lattice rule of rank 1 in a prime base $b$, we have +### 🔹 Randomization formats -$$P_n = \left\{\left(\varphi\left(\frac{h(z) a_1(z)}{Q(z)}\right), \dots, \varphi\left(\frac{h(z) a_s(z)}{Q(z)}\right)\right) : h(z)\in\mathbb{F}_b[z], \ \text{degree}(h(z)) < k\right\}.$$ +Includes: -where $\mathbb{F}_b[z]$ is the space of polynomials with coefficients in $\mathbb{F}_b$, the *modulus* $Q(z) \in \mathbb{F}_b[z]$ is a polynomial of degree $k$, the *generating vector* $\boldsymbol{a}(z) = (a_1(z),\dots,a_s(z))\in \mathbb{F}_b[z]^s$ is a vector of $s$ polynomials of degrees less than $k$, and the mapping $\varphi$ is defined by +- `shiftmod1`: Shift modulo 1 +- `dshift`: Digital shift in base `b` +- `nuscramble`: Nested uniform scramble +- `lmscramble`: Left matrix scramble -$$\varphi\left(\sum_{l = w}^{\infty} x_l z^{-l}\right) = \sum_{l = \max(w, 1)}^{\infty} x_l b^{-l}.$$ +All formats are text-based and reproducible. -This point set has $n = b^k$ points. +--- -We must specify the polynomial modulus $Q(z)$ and the polynomial generating vector $\boldsymbol{a}(z)$. The polynomial modulus will be represented as an integer that has $(k+1)$ digits in base $b$, and all the other polynomials will be represented as integers that have no more than $k$ digits in base $b$. All these integers will be given in base 10 in the file, one per line. In practice, we usually have $b=2$, so $k$ represents the number of bits. The integer that represents a polynomial is obtained simply by replacing the formal variable by $b$. For example, if the polynomial is $Q(z) = z^4 + z^3 + 1$ and $b=2$, its coefficients are "1 1 0 0 1" and its integer representation is $2^4 + 2^3 + 1 = 25$. This is the usual representation, as used in Goda and Dick (2015), for example. In the case of embedded point sets, the modulus should be $Q(z) = z^k$ for $n=b^k$ points, and its integer representation is $b^k$. In particular, $Q(z) = z$ is represented by the integer $b$. +## Example: Parsing a Lattice Rule File -As usual, the first line is a comment that tells the type of file. 
Then the first four non-comment lines give the base $b$, the number $s$ of dimensions, the degree $k$ of the polynomial modulus, and the integer representation of this polynomial. Lines 5 to $s+4$ give the polynomials that form the generating vector, one per line, using the integer representation just explained. One example of a file for a polynomial lattice in the `plattice` format: +Example file: ``` -# plattice -# A polynomial lattice rule in base 2, in 'plattice' format -2 # base b = 2 -8 # s = 8 dimensions -16 # n = 2^16 = 65536 points -45781 # polynomial modulus -# coordinates of the generating vector, starting at j=1: -1 +# lattice +8 +65536 +1 19463 17213 5895 14865 31925 30921 -26671 -17213 -``` - -A polynomial lattice rule in base $b$ can also be represented as a digital net in base $b$, so its parameters can also be provided in a file in the `dnet` format, as for a general digital net in base $b$. But the generating matrices have a special form and the above representation is much more compact (a single integer per row instead of $k$ integers per row). On the other hand, generating the points is faster with the generating matrices than with the polynomial representation, so the software that will use the `plattice` files and generate the points would usually first convert the polynomials into the corresponding generating matrices. LatNet Builder (L’Ecuyer et al., 2022) is also able to make the conversion and produce a file in the `dnet` format, for more convenience and better flexibility, so the user can select the format she/he prefers. - -### Parameters for Sobol nets: `sobol` and `soboljk` - -The Sobol' construction provides another special case of digital nets (and sequences), in base 2. They are defined in many places, including Joe and Kuo (2008). For each coordinate $j$, we select a primitive polynomial $p_j(x)$ of degree $c_j$, and $c_j$ integers $m_{j,1},\dots,m_{j,c_j}$ which are used to define the generating matrix $\boldsymbol{C}_j$. The real numbers $2^{-c} m_{j,c}$ are called the -initial *direction numbers*. More details are given in Joe and Kuo (2008) and [here](http://umontreal-simul.github.io/ssj/docs/master/classumontreal_1_1ssj_1_1hups_1_1SobolSequence.html). - -One obvious option for these point sets is to adopt exactly the same format as Joe and Kuo (2008), because it is already used in many places. The only difference is that we now allow comment lines in the file. In the format of Joe and Kuo (2008), only the first line is skipped. In the proposed format, other comment lines can be added at the beginning of the file, e.g., to give the maximum number of dimensions in the file, the criterion and weights that were used, etc. -Note that Sobol' sequences have an infinite number of points and an unlimited number of dimensions, although the file will give parameters for a finite number of dimensions. - -The other lines of the file specify the primitive polynomials and the initial direction numbers for each dimension $j \ge 2$, one line per dimension. For dimension $j=1$, the generating matrix is the identity and is not given in the file (it is implicit). The columns of this matrix are not obtained via a recurrence based on a primitive polynomial, -so this matrix is handled separately. - -The first number on each line is the dimension $j$. -The second number is the degree $c_j$ of the primitive polynomial $p_j(x)$ used for this dimension.
The third number is the integer that corresponds to the binary representation of the inner coefficients of this polynomial (we ignore the first and last coefficients, they are always 1). For example, if the polynomial is $p_j(x) = x^4 + x^3 + 1$, the coefficients are "1 1 0 0 1", and after removing the first and last "1", we get 100 in base 2, which is 4, so the third column would contain the number 4. (Without removing the first and last "1", the number would be 25 instead.) After these three numbers, there are $c_j$ integers $m_{j,1},\dots,m_{j,c_j}$ where $m_{j,c}$ is the $c^\text{th}$ (real-valued) initial direction number for this coordinate, multiplied by $2^c$ to obtain an integer. This $m_{j,c}$ is the integer formed by taking the bits in row 1 to row $c$ of column $c$, in this order. The last bit is the bit on the diagonal, which is always 1, so all $m_{j,c}$'s are odd integers. I think this format comes from Bratley and Fox (1988). - -We denote this format for Sobol parameters by the `soboljk` keyword. One example of a file in this format is shown below. The first line gives the type of file and the next three lines are comments that must be skipped by the reading program. - -``` -# soboljk -# Parameters for Sobol points, in 'soboljk' format -# 8 dimensions -# j c_j p_j m_{j,c} -2 1 0 1 -3 2 1 1 3 -4 3 1 1 3 1 -5 3 2 1 1 1 -6 4 1 1 1 3 3 -7 4 4 1 3 5 13 -8 5 2 1 1 5 5 17 +26671 ``` -The `soboljk` format can be simplified as follows. First, removing the first and last "1" in the representation of the primitive polynomials saves a bit of memory, but it also makes things slightly more complicated. In the default representations of the primitive polynomials in the code that generates the points, these bits are usually not removed. In SSJ, the first thing we do when reading a file in `soboljk` format is to add them back. Also, the primitive polynomials can be in a separate file, since they never change, and only the (initial) direction numbers (those depend on the selected FOM and weights) would be given to specify the Sobol' points. That is, we remove the first three columns of the `soboljk` format. The Magic Point Shop (Nuyens, 2020) also produces files that contain only the direction numbers. +Python pseudo-code: -One example of a file in this `sobol` format: +```python +with open("lattice_8d.txt") as f: + lines = [l.strip() for l in f if l.strip() and not l.startswith("#")] -``` -# sobol -# Parameters m_{j,c} for Sobol points, in 'sobol' format -# 8 dimensions -1 # This is m_{j,c} for the second coordinate -1 3 -1 3 1 -1 1 1 -1 1 3 3 -1 3 5 13 -1 1 5 5 17 +s = int(lines[0]) +n = int(lines[1]) +a = [int(x) for x in lines[2:2+s]] ``` -A list of the first few primitive polynomials in base 2 is given [here](https://mathworld.wolfram.com/PrimitivePolynomial.html). If we *do not* remove the first and last 1's in their representations, the first primitive polynomials are: `3, 7, 11, 13, 19, 25, ...`. Their degrees are `1, 2, 3, 3, 4, 4, ...`. This representation is the one used in the code of SSJ, for example. We can have a separate file that gives these polynomials, one per line, exactly as in the first three columns of the `soboljk` format. We may also want to remove the first column. +--- -Another, perhaps more convenient, way of storing Sobol' constructions is to just use the general `dnet` format, in which the generating matrices are given explicitly. This `dnet` format is easier to use.
On the other hand, it requires specifying a (maximum) value of $k$, and $k$ integers per row to specify the generating matrices, which leads to larger files. From a file in `sobol` format, one can construct a digital net with an arbitrarily large $k$. +## File Naming Recommendations -When $n = 2^k$ is fixed, so we use exactly $n = 2^k$ points and there is no embedding, we can add one *extra dimension* at the beginning by using the reflected identity as a generating matrix. The successive values for this coordinate will then be $0, 1/n, 2/n, 3/n, \dots$ in this order. This matrix will not be given in the file for Sobol' points; the QMC/RQMC software must handle it. For lattice rules and general digital nets with fixed $n$ (non-embedded), the file could give a first coordinate with this behavior. +To support discoverability and consistent tooling: -## Files that contain randomizations +- All files begin with their keyword (`lattice_`, `dnet_`, `sobol_`, etc.) +- Headers contain: + - Construction method + - Figure of merit (FOM) + - Weights + - Embedded range (if applicable) +- Comments allowed in headers only -The idea of proposing a format for storing specific randomizations was suggested by Fred Hickernell. This can be useful for verification purposes, for example. +--- -We can store randomizations in the following file formats: +## References -- `shiftmod1` -A (random) shift modulo 1. It corresponds to a single point in $[0,1)^s$. -- `dshift` -A digital shift in base $b$. -Also a single point in $[0,1)^s$, but with $r$ digits in base $b$. -- `nuscramble` -A nested uniform scramble in base $b$. -- `lmscramble` -A (linear) left matrix scramble in base $b$. +This dataset incorporates formats and ideas from foundational work in QMC: +- Bratley & Fox (1988) +- Joe & Kuo (2008) +- L’Ecuyer (2016) +- Goda & Dick (2015) +- Nuyens (2020) +- And others listed in the detailed specification [LD_DATA.md](LD_DATA.md). -For a `shiftmod1` in $s$ dimensions, the file will contain $s$ in the first line, followed by $s$ real numbers between 0 and 1, one per line. +--- -``` -# shiftmod1 -# A shift modulo 1, in 'shiftmod1' format -3 # s = 3 dimensions -0.32638741823951621 -0.91325392536931693 -0.1530364040106301 -``` - -For a `dshift` with $r$ digits of accuracy in base $b$, in $s$ dimensions, the file will contain $b$ in the first line, $s$ in the second line, $r$ in the third line, and then $s$ integers from 0 to $b^r-1$, one per line. For the latter, the digits of the base $b$ representation of the integer divided by $b^r$ will be added modulo $b$ to the corresponding digits of the base $b$ representation of the coordinate. For example, if $b=2$ and $r=31$, the randomization XORs the 31 bits of this integer with the 31 most significant bits of the corresponding coordinate of each point. - -``` -# dshift -# A digital shift in base 2, in 'dshift' format -2 # b = 2 -3 # s = 3 -31 # r = 31 -2146832861 -1084390381 -963462828 -``` +## Citation -For a `lmscramble` with $r$ digits of accuracy, for $b^k$ points in base $b$ in $s$ dimensions, we need to store $s$ lower-triangular invertible $r\times r$ matrices with entries in $\{0,\dots,b-1\}$. For $b=2$, each matrix must have only 1's on the diagonal and 0's above the diagonal. Each such matrix can be stored in one line of the file, in exactly the same format as the generating matrices in the `dnet` format, using one integer for each column. We want them in this format for the fast LMS implementation we have in SSJ, for example.
The file will contain $b$ in the first non-comment line, $s$ in the second line, $r$ in the third line, -and then $s$ square lower-triangular and invertible $r\times r$ matrices, one per line, with each column represented as an integer as in the `dnet` format. Thus, each scrambling matrix is represented by $r$ integers on the same line. -Here is an example: +If you use LDData in academic work, please cite: ``` -# lmscramble -# A left matrix scramble in base 2, with 31 digits of resolution. -2 # basis b = 2 -8 # s = 8 dimensions -31 # r = 31 digits -# The columns of the lower-triangular r x r scrambling matrices, one matrix per line: -1673741824 906870912 615843556 213427728 ... -2012537125 1012645254 ... -... +@article{sorokin2025, + title = {{QMCPy}: a {P}ython software for randomized low-discrepancy sequences, quasi-{M}onte {C}arlo, and fast kernel methods}, + author = {Aleksei G. Sorokin}, + year = {2025}, + journal = {ArXiv preprint}, + volume = {abs/2502.14256}, + url = {https://arxiv.org/abs/2502.14256}, +} + + +@inproceedings{choi2022, + title = {Quasi-{M}onte {C}arlo software}, + author = {Choi, Sou-Cheng T. and Hickernell, Fred J. and Rathinavel, Jagadeeswaran and McCourt, Michael J. and Sorokin, Aleksei G.}, + year = {2022}, + booktitle = {{M}onte {C}arlo and Quasi-{M}onte {C}arlo Methods 2020}, + publisher = {Springer International Publishing}, + address = {Cham}, + pages = {23--47}, + isbn = {978-3-030-98319-2}, + editor = {Keller, Alexander}, +} ``` -For a `nuscramble` of the first $r \ge k$ digits, for $n=b^k$ points in base $b$ in $s$ dimensions, with the implementation proposed in Section 3 of Friedel and Keller (2002) and used for $b=2$ in the class `DigitalNetBase2` of SSJ, we need $sn$ blocks of $r$ random digits in base $b$. Each such block can be represented as an integer in the range $\{0,1,\dots,b^r-1\}$. For $b=2$, these are $r$ bit integers. We can store these integers one row per dimension, $n$ integers per row. This gives the following `nuscramble` file format. The first non-comment line contains the base $b$, the second line gives the number $s$ of dimensions, the third line gives $k$ (so that $n = b^k$), the fourth line gives the scramble resolution $r$ (the number of digits that are scrambled), and the following $s$ lines give the $sn$ integers used for the scrambling, $n$ integers per line. Note that this is the same amount of random numbers as we would need if we used plain Monte Carlo instead of RQMC[^4][^5]. - -[^4]: Another way of storing the NUS is as follows. For each coordinate $j$, each point can be identified by a $k$ bit integer, and the NUS maps each such $k$ bit integer to an $r$ bit integer that corresponds to the scrambled coordinate $j$ of this point. So we can simply store this map in an array of size $b^k$ whose entry $i$ contains the corresponding $r$ bit integer. Applying this NUS is then fast and straightforward. - -[^5]: Alternative implementations of NUS that use a hashing function in place of an RNG are proposed in Burley (2020) and Laine and Karras (2011). These methods might be faster and there is much less information to store to reproduce a given scramble, but the hashing function must be fixed, known, and reliable. This essentially amounts to fixing the RNG and storing only its seed. - -``` -# nuscramble -# A nested uniform scramble in base 2, with 30 bits of resolution. -2 # basis b = 2 -8 # s = 8 dimensions -10 # k = 10, so n = 2^10 = 1024 points -30 # r = 30 digits -# The following s rows contain n = 1024 30 bit integers per row: -1173741824 906870912 615843556 213427728 ...
-1012537125 1001975254 ... -... -``` +--- -## File names and other recommendations +## License -It is strongly recommended that all file names start with the corresponding keyword, like `plattice` for a polynomial lattice rule, `sobol` for a Sobol point set, and `lmscramble` for a left matrix scramble, for example. +Apache 2.0 License. +See [`LICENSE`](LICENSE.txt) file for details. -It is also recommended to put enough relevant comments in each file for a knowledgeable human to find what the file is for (type of point set, figure of merit and weights that were used to construct it, range of values of $n$ for embedded point sets, etc.). +--- +## Acknowledgements -We also want some unit tests: some specific parameter files together with the correct output that should be observed when generating the points from these files. +This dataset is developed and maintained by: -## References +- **QMCSoftware team** +- Contributors to QMCPy, SSJ, and LatNet Builder +- The QMC & RQMC research community -- P. Bratley and B. L. Fox. Algorithm 659: Implementing Sobol’s quasirandom sequence generator. ACM Transactions on Mathematical Software, 14(1):88–100, 1988. -- Brent Burley. Practical hash-based Owen scrambling. The Journal of Computer Graphics Techniques, 9(4):1–20, 2020. -- I. Friedel and A. Keller. Fast generation of randomized low-discrepancy point sets. In K.-T. Fang, F. J. Hickernell, and H. Niederreiter, editors, Monte Carlo and Quasi-Monte Carlo Methods 2000, pages 257–273, Berlin, 2002. Springer-Verlag. -- T. Goda and J. Dick. Construction of interlaced scrambled polynomial lattice rules of arbitrary high order. Foundations of Computational Mathematics, 15:1245–1278, 2015. -- S. Joe and F. Y. Kuo. Constructing Sobol sequences with better two-dimensional projections. SIAM Journal on Scientific Computing, 30(5):2635–2654, 2008. -- S. Laine and T. Karras. Stratified sampling for stochastic transparency. Computer Graphics Forum, 30(4):1197–1204, 2011. -- P. L’Ecuyer. SSJ: Stochastic simulation in Java. http://simul.iro.umontreal.ca/ssj/, accessed 9th August 2021, 2016. -- P. L’Ecuyer, P. Marion, M. Godin, and F. Puchhammer. A tool for custom construction of QMC and RQMC point sets. In A. Keller, editor, Monte Carlo and Quasi-Monte Carlo Methods: MCQMC 2020, pages 51–70, Berlin, 2022. Springer. https://arxiv.org/abs/2012.10263. -- D. Nuyens. The magic point shop, 2020. https://people.cs.kuleuven.be/~dirk.nuyens/qmc-generators/. +Special thanks to the researchers who provided the widely used generating vectors and direction numbers found throughout the scientific computing community.
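As a worked illustration of the randomization files described above, here is a minimal sketch that reads a `dshift` file and applies it to points stored as $r$-digit integers in base $b$; for $b=2$ the digit-wise addition modulo $b$ is exactly the XOR described in the `dshift` paragraph. The helper names (`read_noncomment_values`, `apply_dshift`, `digit_add`) are hypothetical, not functions from any of the libraries cited here:

```python
# Minimal sketch, assuming the 'dshift' file layout described above
# (base b, dimension s, resolution r, then s shift integers, one per line).
# All function names here are hypothetical, not existing library APIs.

def read_noncomment_values(path):
    with open(path) as f:
        vals = [line.split("#")[0].strip() for line in f]
    return [v for v in vals if v]

def digit_add(x, d, b, r):
    """Add the base-b digits of x and d modulo b, digit by digit."""
    y, mult = 0, 1
    for _ in range(r):
        y += ((x % b + d % b) % b) * mult
        x //= b
        d //= b
        mult *= b
    return y

def apply_dshift(path, points_int):
    """points_int[i][j] is coordinate j of point i, an integer in [0, b**r)."""
    vals = read_noncomment_values(path)
    b, s, r = int(vals[0]), int(vals[1]), int(vals[2])
    shifts = [int(v) for v in vals[3:3 + s]]
    shifted = []
    for pt in points_int:
        if b == 2:
            shifted.append([x ^ d for x, d in zip(pt, shifts)])  # plain XOR
        else:
            shifted.append([digit_add(x, d, b, r) for x, d in zip(pt, shifts)])
    return shifted  # divide each entry by b**r to get coordinates in [0, 1)
```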
diff --git a/env.yml b/env.yml index 8cdc4d3..2eeb851 100644 --- a/env.yml +++ b/env.yml @@ -101,8 +101,8 @@ dependencies: - python-json-logger==3.3.0 - pyyaml==6.0.2 - pyzmq==26.3.0 - - qmcpy==1.6.2 - - qmctoolscl==1.1.2 + - qmcpy==2.0 + - qmctoolscl==1.1.5 - referencing==0.36.2 - requests==2.32.3 - rfc3339-validator==0.1.4 @@ -126,3 +126,5 @@ dependencies: - webcolors==24.11.1 - webencodings==0.5.1 - websocket-client==1.8.0 + - huggingface_hub>=0.32.0 + - pytest==9.0.1 diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..8d63b31 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,16 @@ +[build-system] +requires = ["setuptools>=61.0", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "LDData" +version = "0.0.0" +description = "LDData helper package for tests" +readme = "README.md" +requires-python = ">=3.11" +license = { text = "MIT" } +authors = [ { name = "QMCSoftware" } ] + +[tool.setuptools.packages.find] +where = ["."] +include = ["LDData"] diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..20437c8 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,6 @@ +[pytest] +minversion = 6.0 +addopts = -q -m "not integration" +testpaths = tests +markers = + integration: mark tests that require network access or are destructive to remote state \ No newline at end of file diff --git a/scripts/git_lfs_upload.sh b/scripts/git_lfs_upload.sh new file mode 100644 index 0000000..44488a3 --- /dev/null +++ b/scripts/git_lfs_upload.sh @@ -0,0 +1,241 @@ +#!/usr/bin/env bash +# git_lfs_upload.sh +# +# Automate uploading folders/files to a Hugging Face dataset repo using git + git-lfs. +# This preserves folder structure, avoids API rate limits, and handles large files. +# +# Usage: +# 1) Make executable: +# chmod +x scripts/git_lfs_upload.sh +# 2) Run: +# ./scripts/git_lfs_upload.sh \ +# --repo-id QMCSoftware/LDData \ +# --local-path /Users/terrya/Documents/ProgramData/LDData \ +# --folders dnet,lattice,pregenerated_pointsets,README.md \ +# --branch main +# +# Environment notes: +# - If the repo is private, set HF_TOKEN in your environment or pass --token. +# If HF_TOKEN is present, the script uses a temporary GIT_ASKPASS helper for non-interactive auth; the token is never embedded in the clone URL. +# - Install prerequisites: git, git-lfs, rsync (macOS: `brew install git-lfs rsync`). +# - The script commits and pushes each folder separately to reduce the size of each push. + +set -euo pipefail +IFS=$'\n\t' + +REPO_ID="" +LOCAL_PATH="." +FOLDERS="" +BRANCH="main" +HF_TOKEN="${HF_TOKEN:-}" +CLONE_DIR="hf_repo" +EXCLUDES=() +LFS_PATTERNS=("*.bin" "*.zip" "*.tar" "*.tgz" "*.h5" "*.npy" "*.npz" "*.ckpt" "*.pt" "*.pth" "*.gz") + +print_usage() { + cat <<EOF +Usage: $0 --repo-id <repo-id> [options] + +Options: + --repo-id <repo-id> Hugging Face dataset repo id (required) + --local-path <path> Local LDData root (default: .) + --folders <list> Comma-separated folders/files to upload (e.g. dnet,lattice,README.md) + --branch <branch> Git branch to push to (default: main) + --token <token> Hugging Face token (or set HF_TOKEN env var) + --clone-dir <dir> Directory to clone the repo into (default: hf_repo) + --exclude <pattern> Add an rsync exclude pattern (can be supplied multiple times) + -h, --help Show this help and exit + +Example: + $0 --repo-id QMCSoftware/LDData --local-path . --folders dnet,lattice,README.md + +This script: + - clones the target dataset repo, + - copies the requested folders/files into the clone (preserving structure), + - enables git-lfs for common large file types, + - commits and pushes each folder/file separately to reduce push sizes.
+ +EOF +} + +# Simple arg parsing +while [[ $# -gt 0 ]]; do + case "$1" in + --repo-id) + REPO_ID="$2"; shift 2;; + --local-path) + LOCAL_PATH="$2"; shift 2;; + --folders) + FOLDERS="$2"; shift 2;; + --branch) + BRANCH="$2"; shift 2;; + --token) + HF_TOKEN="$2"; shift 2;; + --clone-dir) + CLONE_DIR="$2"; shift 2;; + --exclude) + EXCLUDES+=("$2"); shift 2;; + -h|--help) + print_usage; exit 0;; + *) + echo "Unknown arg: $1" >&2; print_usage; exit 2;; + esac +done + +if [[ -z "$REPO_ID" ]]; then + echo "--repo-id is required" >&2 + print_usage + exit 2 +fi + +# Normalize LOCAL_PATH +LOCAL_PATH=$(cd "$LOCAL_PATH" && pwd) + +echo "Repo ID: $REPO_ID" +echo "Local path: $LOCAL_PATH" +echo "Folders: $FOLDERS" +echo "Branch: $BRANCH" + +# Check dependencies +command -v git >/dev/null 2>&1 || { echo "git not found; install git." >&2; exit 1; } +command -v rsync >/dev/null 2>&1 || { echo "rsync not found; install rsync." >&2; exit 1; } +if ! command -v git-lfs >/dev/null 2>&1; then + echo "git-lfs not found; installing is recommended. Please install git-lfs and run 'git lfs install'." >&2 + echo "On macOS: brew install git-lfs && git lfs install" >&2 + read -p "Continue without git-lfs? [y/N]: " c + if [[ "$c" != "y" && "$c" != "Y" ]]; then + exit 1 + fi +fi + +# Prepare clone URL (do NOT embed token in the URL) +CLONE_URL="https://huggingface.co/datasets/${REPO_ID}.git" + +# If HF_TOKEN is provided, use a temporary GIT_ASKPASS helper so the token +# is not exposed on the command line or process list. The helper prints the +# token when git prompts for a password. We keep the helper for the duration +# of the script (so pushes work) and remove it on exit. +if [[ -n "$HF_TOKEN" ]]; then + ASKPASS_SCRIPT=$(mktemp -t hf_askpass.XXXXXX) + # Write an askpass helper that prints the token from the environment. + # Use a quoted heredoc so $HF_TOKEN is not expanded into the file. + cat > "$ASKPASS_SCRIPT" <<'ASKPASS_EOF' +#!/usr/bin/env sh +# GIT_ASKPASS helper: print HF_TOKEN from the environment (do not echo a newline) +printf "%s" "$HF_TOKEN" +ASKPASS_EOF + chmod 700 "$ASKPASS_SCRIPT" + export GIT_ASKPASS="$ASKPASS_SCRIPT" + # Prevent git from falling back to terminal prompting + export GIT_TERMINAL_PROMPT=0 + + cleanup_askpass() { + unset GIT_ASKPASS + unset GIT_TERMINAL_PROMPT + rm -f "$ASKPASS_SCRIPT" || true + } + trap cleanup_askpass EXIT +fi + +# Clone the repo +if [[ -d "$CLONE_DIR" ]]; then + echo "Removing existing clone dir $CLONE_DIR" + rm -rf "$CLONE_DIR" +fi + +echo "Cloning ${CLONE_URL} -> ${CLONE_DIR}" +if ! git clone --depth 1 --branch "$BRANCH" "$CLONE_URL" "$CLONE_DIR"; then + echo "Initial clone failed; trying full clone (no depth)" + git clone --branch "$BRANCH" "$CLONE_URL" "$CLONE_DIR" +fi + +pushd "$CLONE_DIR" >/dev/null + +# Configure git user if not set +if ! git config user.email >/dev/null; then + git config user.email "uploader@example.com" +fi +if ! 
git config user.name >/dev/null; then + git config user.name "LDData uploader" +fi + +# Ensure branch exists locally +git checkout -B "$BRANCH" + +# Set up git-lfs patterns (only add if git-lfs available) +if command -v git-lfs >/dev/null 2>&1; then + echo "Configuring git-lfs patterns: ${LFS_PATTERNS[*]}" + for pat in "${LFS_PATTERNS[@]}"; do + git lfs track --no-update "$pat" || true + done + # Ensure .gitattributes is added + git add .gitattributes || true + git commit -m "Add git-lfs tracking patterns" --allow-empty || true +fi + +# Helper to build rsync exclude args +RSYNC_EXCLUDE_ARGS=() +for ex in "${EXCLUDES[@]}"; do + RSYNC_EXCLUDE_ARGS+=(--exclude "$ex") +done + +# Copy function: copy a single folder or file into the clone preserving path +copy_item() { + local item="$1" + echo "Processing: $item" + if [[ -d "$LOCAL_PATH/$item" ]]; then + mkdir -p "$(dirname "$item")" + rsync -av --delete "${RSYNC_EXCLUDE_ARGS[@]}" "$LOCAL_PATH/$item" "./$(dirname "$item")/" + elif [[ -f "$LOCAL_PATH/$item" ]]; then + mkdir -p "$(dirname "$item")" + rsync -av "${RSYNC_EXCLUDE_ARGS[@]}" "$LOCAL_PATH/$item" "./$(dirname "$item")/" + else + echo "Warning: $item not found in $LOCAL_PATH; skipping" + fi +} + +# Commit & push a path (folder or file) +commit_and_push() { + local path="$1" + git add --all "$path" || true + if git diff --staged --quiet; then + echo "No changes staged for $path" + return + fi + git commit -m "Upload $path" || true + echo "Pushing $path to origin/$BRANCH" + git push origin "$BRANCH" +} + +# If folders list is empty, upload whole workspace excluding excludes +if [[ -z "$FOLDERS" ]]; then + echo "No --folders provided; copying whole local tree (respecting excludes)." + rsync -av --delete "${RSYNC_EXCLUDE_ARGS[@]}" "$LOCAL_PATH/" ./ + commit_and_push "." +else + # iterate comma-separated list + IFS=',' read -ra ITEMS <<< "$FOLDERS" + for it in "${ITEMS[@]}"; do + it_trimmed=$(echo "$it" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//') + if [[ -z "$it_trimmed" ]]; then + continue + fi + copy_item "$it_trimmed" + commit_and_push "$it_trimmed" + # pause between folder pushes to avoid network bursts + sleep 3 + done +fi + +# Final push of any remaining changes +git add --all || true +if ! git diff --staged --quiet; then + git commit -m "Upload remaining files" || true + git push origin "$BRANCH" +fi + +popd >/dev/null + +echo "Upload complete. 
Repository at: https://huggingface.co/datasets/${REPO_ID}" + +# End of script diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..60c9fcc --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,6 @@ +import sys +from pathlib import Path + +# Ensure repository root is on sys.path so tests can import the local package +ROOT = Path(__file__).resolve().parents[1] +sys.path.insert(0, str(ROOT)) diff --git a/tests/test_hf_dataset.py b/tests/test_hf_dataset.py new file mode 100644 index 0000000..94dac2b --- /dev/null +++ b/tests/test_hf_dataset.py @@ -0,0 +1,16 @@ +from huggingface_hub import HfApi + + +def test_sou_cheng_lddata_exists_and_has_readme(): + """Fast smoke test: ensure the public dataset repo exists and includes a README.""" + api = HfApi() + files = api.list_repo_files("Sou-Cheng/LDData", repo_type="dataset") + + assert isinstance(files, list), "Expected list of files from Hugging Face" + assert len(files) > 0, "Dataset 'Sou-Cheng/LDData' appears to be empty or not accessible" + + # Check for README presence in a case-insensitive manner + lowered = [f.lower() for f in files] + assert any(name.endswith("readme.md") for name in lowered), ( + "Expected a README.md in the dataset files; got: " + ", ".join(files[:10]) + ) diff --git a/tests/test_integration_hf.py b/tests/test_integration_hf.py new file mode 100644 index 0000000..5059b08 --- /dev/null +++ b/tests/test_integration_hf.py @@ -0,0 +1,71 @@ +import os +import shutil +import tempfile +from pathlib import Path + +import pytest + +try: + from huggingface_hub import HfApi +except Exception: # pragma: no cover - if huggingface_hub missing, skip integration + HfApi = None + + +HF_INTEGRATION = os.environ.get("HF_INTEGRATION", "0").lower() in ("1", "true", "yes") +HF_TOKEN = os.environ.get("HF_TOKEN") +REPO_ID = os.environ.get("HF_INTEGRATION_REPO", "QMCSoftware/LDData") + + +@pytest.mark.integration +@pytest.mark.skipif(not HF_INTEGRATION or HfApi is None, reason="Integration test disabled (set HF_INTEGRATION=1 and install huggingface_hub)") +def test_delete_and_upload_lattice_and_readme(): + """Integration test: delete remote dataset repo and upload only `lattice/` + `README.md`. + + WARNING: This test is destructive. It will delete the remote dataset repo + specified by `REPO_ID` and recreate it. Only enable it locally or in CI + when you explicitly want this behavior by setting `HF_INTEGRATION=1`. 
+ """ + + if not HF_TOKEN: + pytest.skip("HF_TOKEN not provided; skipping destructive integration test") + + repo_root = Path(__file__).resolve().parents[2] + src_lattice = repo_root / "lattice" + src_readme = repo_root / "README.md" + + if not src_lattice.exists(): + pytest.skip("lattice/ folder not present in repo root; skipping") + + api = HfApi(token=HF_TOKEN) + + # Delete remote repo if it exists + try: + api.delete_repo(repo_id=REPO_ID, repo_type="dataset") + except Exception: + # ignore errors (repo may not exist or insufficient perms) + pass + + # Create the dataset repo (exist_ok=True will not fail if already present) + try: + api.create_repo(repo_id=REPO_ID, repo_type="dataset", exist_ok=True) + except Exception as exc: + pytest.fail(f"Failed to create dataset repo {REPO_ID}: {exc}") + + # Prepare a temporary folder containing only lattice/ and README.md + with tempfile.TemporaryDirectory() as td: + td_path = Path(td) + shutil.copytree(src_lattice, td_path / "lattice") + if src_readme.exists(): + shutil.copy(src_readme, td_path / "README.md") + + # Upload the folder contents to the dataset repo + try: + api.upload_folder(folder_path=str(td_path), repo_id=REPO_ID, repo_type="dataset") + except TypeError: + # Some versions of HfApi.upload_folder have a different signature + api.upload_folder(str(td_path), repo_id=REPO_ID, repo_type="dataset") + + # Verify that README.md exists in the remote dataset files + files = api.list_repo_files(REPO_ID, repo_type="dataset") + lowered = [f.lower() for f in files] + assert any(p.endswith("readme.md") for p in lowered), "README.md not found in uploaded dataset" diff --git a/tests/test_upload_unit.py b/tests/test_upload_unit.py new file mode 100644 index 0000000..c39c78d --- /dev/null +++ b/tests/test_upload_unit.py @@ -0,0 +1,96 @@ +import tempfile +import os +from pathlib import Path +import types + +import pytest + +from LDData import upload as up + + +class DummyAPI: + def __init__(self): + self.deleted = False + self.created = False + self.uploaded_folder = False + self.uploaded_large = False + self.uploaded_files = [] + + def delete_repo(self, **kwargs): + self.deleted = True + + def create_repo(self, **kwargs): + self.created = True + + def upload_folder(self, *args, **kwargs): + self.uploaded_folder = True + + def upload_large_folder(self, *args, **kwargs): + self.uploaded_large = True + + def upload_file(self, *args, **kwargs): + # record path_in_repo argument if present + if 'path_in_repo' in kwargs: + self.uploaded_files.append(kwargs['path_in_repo']) + else: + # some API variants use positional args; accept that case too + if len(args) >= 2: + self.uploaded_files.append(args[1]) + + +def test_delete_remote_repo_calls_delete_when_yes_true(): + api = DummyAPI() + # call with yes=True to avoid interactive prompt + up.delete_remote_repo(api, repo_id="owner/repo", token="tok", yes=True) + assert api.deleted is True + + +def test_perform_bulk_upload_uses_upload_folder(tmp_path, monkeypatch): + api = DummyAPI() + # create a small file so total_size < LARGE_THRESHOLD + d = tmp_path / "repo" + d.mkdir() + f = d / "a.txt" + f.write_text("hello") + + ok = up.perform_bulk_upload(api, local_path=d, repo_id="owner/repo", ignore_patterns=[], token="tok") + assert ok is True + assert api.uploaded_folder is True + + +def test_perform_bulk_upload_large_prefers_large_when_available(tmp_path): + api = DummyAPI() + # create a file > LARGE_THRESHOLD to trigger large upload path + d = tmp_path / "repo2" + d.mkdir() + big = d / "big.bin" + # write a file 
slightly larger than threshold (50 MiB) + big.write_bytes(b"0" * (50 * 1024 * 1024 + 10)) + + # monkeypatch attribute to simulate large upload existing + # DummyAPI already has upload_large_folder method + ok = up.perform_bulk_upload(api, local_path=d, repo_id="owner/repo", ignore_patterns=[], token="tok") + assert ok is True + assert api.uploaded_large is True + + +def test_perform_per_file_upload_calls_upload_file(tmp_path): + api = DummyAPI() + # create a couple of files list format expected by perform_per_file_upload: (fullpath, rel) + d = tmp_path / "repo3" + d.mkdir() + (d / "x.txt").write_text("x") + (d / "sub").mkdir() + (d / "sub" / "y.txt").write_text("y") + + files = [] + for root, _, fnames in os.walk(d): + for fn in fnames: + full = Path(root) / fn + rel = os.path.relpath(str(full), start=str(d)).replace(os.sep, '/') + files.append((full, rel)) + + up.perform_per_file_upload(api, files, repo_id="owner/repo", token="tok") + # uploaded_files should include both relative paths + assert any(p.endswith('x.txt') for p in api.uploaded_files) + assert any(p.endswith('sub/y.txt') for p in api.uploaded_files) diff --git a/upload.py b/upload.py new file mode 100644 index 0000000..7c0deae --- /dev/null +++ b/upload.py @@ -0,0 +1,401 @@ +#!/usr/bin/env python +""" +upload.py + +Upload the QMCSoftware/LDData repository (low discrepancy generating vectors +and matrices) to the Hugging Face Datasets Hub as a dataset repo. + +This script: + +1. Creates (or reuses) a dataset repo on the Hub. +2. Uploads all files from a local LDData checkout using `upload_folder`. +3. Leaves README.md in place so it becomes the dataset card. + After upload you can edit the card in the web UI to: + - Link to the paper (arXiv:2502.14256). + - Add "Citation" and "Uses" sections, similar to + - facebook/omnilingual-asr-corpus + - nvidia/PhysicalAI-Autonomous-Vehicles + - moondream/refcoco-m + +Requirements: + pip install "huggingface_hub>=0.32.0" + +Authentication: + - Either set HF_TOKEN in your environment: + export HF_TOKEN=hf_xxx... + OR pass --token on the command line. + +Example usage: + python upload.py \ + --repo-id QMCSoftware/LDData \ + --local-path /path/to/local/LDData + +After upload you’ll be able to do, e.g.: + + from datasets import load_dataset + ds = load_dataset("QMCSoftware/LDData") + +and link the dataset to your paper page on Hugging Face. +""" + +import argparse +import os +import sys +import time +import random +from pathlib import Path +import httpx +import fnmatch + +from huggingface_hub import HfApi, create_repo # type: ignore +from huggingface_hub.errors import HfHubHTTPError # type: ignore + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Upload LDData to the Hugging Face Datasets Hub." + ) + parser.add_argument( + "--repo-id", + type=str, + default="QMCSoftware/LDData", + help="Target dataset repo id on the Hub (e.g. 'QMCSoftware/LDData').", + ) + parser.add_argument( + "--local-path", + type=str, + default=".", + help="Path to local LDData checkout (default: current directory).", + ) + parser.add_argument( + "--token", + type=str, + default=None, + help="Hugging Face access token. 
If omitted, HF_TOKEN env var is used.", + ) + parser.add_argument( + "--private", + action="store_true", + help="Create the dataset repo as private (default: public).", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Do not upload, just print what would be done.", + ) + parser.add_argument( + "--reset-remote", + action="store_true", + help="Delete the remote dataset repo on the Hub before uploading (destructive).", + ) + parser.add_argument( + "--yes", + action="store_true", + help="Answer yes to any interactive confirmation prompts (use with care).", + ) + return parser.parse_args() + + +def get_token(cmd_token: str | None) -> str: + token = cmd_token or os.environ.get("HF_TOKEN") + if not token: + raise SystemExit( + "No token provided. Please either:\n" + " - set HF_TOKEN in your environment, or\n" + " - pass --token hf_xxx... on the command line." + ) + return token + + +def retry_call( + fn, + *args, + retries: int = 6, + base_delay: float = 1.0, + max_delay: float = 60.0, + allowed_status_for_retry=(429, 500, 502, 503, 504), + **kwargs, +): + """ + Call `fn(*args, **kwargs)` with retries on rate limit (429), server errors and read timeouts. + Uses exponential backoff with jitter. If the server provides a Retry-After header it will be honored. + """ + last_exc = None + for attempt in range(retries): + try: + return fn(*args, **kwargs) + except HfHubHTTPError as exc: + last_exc = exc + # Try to extract status code and headers robustly + status_code = getattr(exc, "status_code", None) + headers = {} + resp = getattr(exc, "response", None) + if resp is not None: + try: + status_code = getattr(resp, "status_code", status_code) + headers = getattr(resp, "headers", {}) or {} + except Exception as header_exc: + # Log the problem for debugging purposes but continue execution + print(f"WARNING: could not inspect the error response: {header_exc!r}") + if status_code in allowed_status_for_retry: + # Honor Retry-After if present; otherwise fall back to exponential backoff + retry_after = None + for key in ("retry-after", "Retry-After"): + if key in headers: + retry_after = headers.get(key) + break + delay = None + if retry_after is not None: + try: + delay = float(retry_after) + except (TypeError, ValueError): + delay = None + if delay is None: + delay = min(max_delay, base_delay * (2 ** attempt)) + delay += random.uniform(0, base_delay) + # On last attempt, break and raise + if attempt == retries - 1: + break + time.sleep(delay) + continue + # Non-retryable HTTP error + raise + except httpx.ReadTimeout as exc: + last_exc = exc + if attempt == retries - 1: + break + delay = min(max_delay, base_delay * (2 ** attempt)) + random.uniform(0, base_delay) + time.sleep(delay) + continue + except Exception as exc: + # For other exceptions, do not retry except maybe transient httpx.NetworkError -- keep simple and re-raise + last_exc = exc + raise + # If we exit loop without returning, raise last exception + if last_exc is not None: + raise last_exc + raise RuntimeError("retry_call failed without exception") + + +def create_api(token: str) -> HfApi: + """Create an authenticated HfApi client.""" + return HfApi(token=token) + + +def delete_remote_repo(api: HfApi, repo_id: str, token: str, yes: bool) -> None: + """Delete the remote dataset repo if requested. Non-fatal on errors.""" + if not yes: + resp = input( + f"Are you sure you want to DELETE the dataset repo '{repo_id}' on Hugging Face? This is irreversible. 
Type 'yes' to continue: " + ) + if resp.strip().lower() != "yes": + print("Aborting: remote reset cancelled by user.") + sys.exit(0) + + print(f"Deleting remote dataset repo '{repo_id}' (if it exists) on Hugging Face...") + try: + retry_call(getattr(api, "delete_repo"), repo_id=repo_id, repo_type="dataset", token=token) + print("Remote dataset repo deleted (or did not exist).") + except HfHubHTTPError as exc: + print() + print("WARNING: Failed to delete remote dataset repo:") + print(f" {exc!r}") + print("Continuing to (re)create the repository.") + except Exception as exc: + print() + print("WARNING: Unexpected error while attempting to delete remote repo:") + print(f" {exc!r}") + print("Continuing to (re)create the repository.") + + +def create_or_reuse_repo(repo_id: str, private: bool, token: str) -> None: + """Create (or reuse) the dataset repo on the Hub.""" + print(f"Creating (or reusing) dataset repo '{repo_id}' on the Hub...") + retry_call(create_repo, repo_id=repo_id, repo_type="dataset", private=private, exist_ok=True, token=token) + + +def build_ignore_checker(ignore_patterns): + def is_ignored(rel_path: str) -> bool: + for patt in ignore_patterns: + if fnmatch.fnmatch(rel_path, patt) or fnmatch.fnmatch("/" + rel_path, patt): + return True + return False + + return is_ignored + + +def gather_files(local_path: Path, ignore_patterns): + """Return a list of (full_path, rel_posix_path) and total size, skipping ignore patterns.""" + is_ignored = build_ignore_checker(ignore_patterns) + files_to_upload = [] + total_size = 0 + for root, _, files in os.walk(local_path): + for fname in files: + full = Path(root) / fname + rel = os.path.relpath(str(full), start=str(local_path)) + rel = rel.replace(os.sep, "/") + if rel.startswith("./"): + rel = rel[2:] + if rel.startswith("/"): + rel = rel.lstrip("/") + if is_ignored(rel): + continue + try: + sz = full.stat().st_size + except OSError: + sz = 0 + files_to_upload.append((full, rel)) + total_size += sz + return files_to_upload, total_size + + +def perform_bulk_upload(api: HfApi, local_path: Path, repo_id: str, ignore_patterns, token: str) -> bool: + """Attempt bulk upload via upload_large_folder or upload_folder. 
Returns True on success.""" + LARGE_THRESHOLD = 50 * 1024 * 1024 + # prefer upload_large_folder if folder is large and API provides it + files_to_upload, total_size = gather_files(local_path, ignore_patterns) + try: + if total_size > LARGE_THRESHOLD and hasattr(api, "upload_large_folder"): + try: + retry_call(getattr(api, "upload_large_folder"), folder_path=str(local_path), repo_id=repo_id, repo_type="dataset", ignore_patterns=ignore_patterns, token=token) + return True + except TypeError: + retry_call(getattr(api, "upload_large_folder"), folder_path=str(local_path), repo_id=repo_id, repo_type="dataset", token=token) + return True + except Exception as exc: + print() + print("ERROR: upload_large_folder failed:") + print(f" {exc!r}") + print("Falling back to per-file upload...") + return False + else: + retry_call(getattr(api, "upload_folder"), folder_path=str(local_path), repo_id=repo_id, repo_type="dataset", ignore_patterns=ignore_patterns) + return True + except httpx.ReadTimeout: + print() + print("ERROR: Upload read timed out.") + print("Possible actions:") + print(" - Check your network connection and try again.") + print(" - Try uploading in smaller batches (split large files or directories).") + print(" - Use `HfApi().upload_large_folder(...)` or the CLI `hf upload-large-folder` if available.") + print(" - Upgrade huggingface_hub to the latest version in case it adds improved timeout handling.") + print(" - If you have very large files, consider using git-lfs or the web UI.") + sys.exit(1) + except TypeError as exc: + print() + print("ERROR: Upload function raised a TypeError (likely a signature mismatch):") + print(f" {exc!r}") + print("Falling back to per-file upload...") + return False + except HfHubHTTPError as exc: + print() + print("ERROR: HF Hub returned an HTTP error while attempting to upload:") + print(f" {exc!r}") + print("If this is a rate limit (429) you may need to wait, reduce request rate, or upgrade your plan.") + sys.exit(1) + except Exception: + return False + + +def perform_per_file_upload(api: HfApi, files_to_upload, repo_id: str, token: str) -> None: + """Upload files one-by-one as a fallback.""" + if not files_to_upload: + return + print("Falling back to per-file upload (this is slower but more robust for flaky networks)...") + for idx, (file_path, rel) in enumerate(files_to_upload): + path_in_repo = rel.lstrip("/") + success = False + try: + retry_call(getattr(api, "upload_file"), path_or_fileobj=str(file_path), path_in_repo=path_in_repo, repo_id=repo_id, repo_type="dataset", token=token) + success = True + except httpx.ReadTimeout: + print() + print("ERROR: Per-file upload read timed out on:", path_in_repo) + print("You can retry this script or use `hf upload-large-folder` / `HfApi.upload_large_folder`.") + sys.exit(1) + except HfHubHTTPError as exc: + print() + print("ERROR: Failed to upload file due to HF Hub HTTP error:", path_in_repo) + print(f" {exc!r}") + except Exception as exc: + print(f"WARNING: Failed to upload {path_in_repo!s}: {exc!r}") + if not success: + print(f"Failed to upload: {path_in_repo}") + time.sleep(0.1 + random.uniform(0, 0.05)) + + +def main() -> None: + args = parse_args() + + local_path = Path(args.local_path).expanduser().resolve() + if not local_path.exists(): + raise SystemExit(f"Local path does not exist: {local_path}") + + readme = local_path / "README.md" + if not readme.exists(): + print(f"WARNING: {readme} does not exist. 
Are you sure this is the LDData repo root?", file=sys.stderr) + + repo_id = args.repo_id + + print(f"Using repo_id: {repo_id}") + print(f"Local path : {local_path}") + print(f"Private : {args.private}") + if args.dry_run: + print("Dry run enabled: NOT creating or uploading, just showing intent.") + return + + # A token is only needed once we actually talk to the Hub, so a dry run works without one + token = get_token(args.token) + api = create_api(token) + + if args.reset_remote: + delete_remote_repo(api, repo_id, token, args.yes) + + create_or_reuse_repo(repo_id, args.private, token) + + ignore_patterns = [ + "sc/*", + ".git/*", + ".gitignore", + ".DS_Store", + "__pycache__/*", + "*.pyc", + "*.pyo", + "*~", + "*.ipynb_checkpoints*", + "raw.githubusercontent.com/*", + ] + + print("Uploading local folder to the Hub (this may take a while)...") + + # gather files in case we need per-file fallback + files_to_upload, _ = gather_files(local_path, ignore_patterns) + + bulk_ok = perform_bulk_upload(api, local_path, repo_id, ignore_patterns, token) + if bulk_ok: + print("Bulk upload succeeded — skipping per-file fallback.") + else: + perform_per_file_upload(api, files_to_upload, repo_id, token) + + dataset_url = f"https://huggingface.co/datasets/{repo_id}" + print() + print("✅ Upload complete.") + print(f"Dataset is now available at: {dataset_url}") + print() + print("Next steps (recommended):") + print(" 1. Open the dataset page above in your browser.") + print(" 2. Edit the Dataset Card (README.md) to:") + print(" - Add paper links (e.g., your QMCSoftware/LDData arXiv paper).") + print(" - Add a 'Citations' section.") + print(" - Add 'Uses' and 'Limitations' sections, similar to:") + print(" - facebook/omnilingual-asr-corpus") + print(" - nvidia/PhysicalAI-Autonomous-Vehicles") + print(" - moondream/refcoco-m") + print(" 3. Use 'Paper' / 'Dataset' linking in the Hugging Face UI to") + print(" attach the dataset to your paper so it shows up on the") + print(" paper page and in discovery views.") + + +if __name__ == "__main__": + main() \ No newline at end of file
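Finally, a brief note on running the test suites in this changeset: the `pytest.ini` above defaults to `-m "not integration"`, so the destructive Hub test only runs when explicitly selected (a later `-m` on the command line overrides the one coming from `addopts`). A sketch, with a placeholder token value:

```bash
# Default run: fast unit and smoke tests only (pytest.ini excludes 'integration')
python -m pytest -q

# Destructive integration test: deletes and recreates the target dataset repo.
# Requires an explicit opt-in and a real token (hf_xxx is a placeholder).
HF_INTEGRATION=1 HF_TOKEN=hf_xxx python -m pytest -q -m integration tests/test_integration_hf.py
```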