diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
index aa608ef2d..af9c71f80 100644
--- a/.github/workflows/python-publish.yml
+++ b/.github/workflows/python-publish.yml
@@ -1,10 +1,14 @@
-name: Build and Publish Python Package
+name: Build and Publish self_encryption Python Package

 on:
   push:
     tags:
       - 'v*'

+env:
+  PYTHON_VERSION: "3.10"
+  PACKAGE_NAME: "self_encryption"
+
 # Add top-level permissions block
 permissions:
   id-token: write
@@ -30,12 +34,12 @@ jobs:
         uses: PyO3/maturin-action@v1
         with:
           target: ${{ matrix.target }}
-          args: --release --out dist
+          args: --release --out dist -i python${{ matrix.python-version }}
           sccache: 'true'
       - name: Upload wheels
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
-          name: wheels
+          name: wheels-macos-${{ matrix.target }}-py${{ matrix.python-version }}
           path: dist/*.whl
           if-no-files-found: error
@@ -58,47 +62,240 @@ jobs:
       - name: Build wheels
         uses: PyO3/maturin-action@v1
         with:
-          args: --release --out dist
+          args: --release --out dist -i python${{ matrix.python-version }}
           sccache: 'true'
       - name: Upload wheels
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
-          name: wheels
+          name: wheels-windows-${{ matrix.target }}-py${{ matrix.python-version }}
           path: dist/*.whl
           if-no-files-found: error

   linux:
     runs-on: ubuntu-latest
-    # Add permissions to job
-    permissions:
-      id-token: write
-      contents: read
     strategy:
       matrix:
+        target: [x86_64, i686]
         python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
-        target: [x86_64]
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v3
       - uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
-      - name: Install dependencies
-        run: |
-          python -m pip install --user cffi
-          python -m pip install --user patchelf
+          architecture: x64
       - name: Build wheels
         uses: PyO3/maturin-action@v1
+        env:
+          PYTHON_VERSION: ${{ matrix.python-version }}
         with:
           target: ${{ matrix.target }}
           manylinux: auto
-          args: --release --out dist
-          sccache: 'true'
+          args: --release --out dist -i python${{ matrix.python-version }}
+      - name: Install built wheel
+        if: matrix.target == 'x86_64'
+        run: |
+          # List all wheels to debug
+          echo "Available wheels:"
+          ls -la dist/
+
+          # Quote the click requirement so the shell does not treat ">" as a redirect
+          pip install -U pip pytest "click>=8.0.0"
+          pip install --find-links dist/ ${{ env.PACKAGE_NAME }}
+          pytest -v
       - name: Upload wheels
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
-          name: wheels
-          path: dist/*.whl
-          if-no-files-found: error
+          name: wheels-linux-${{ matrix.target }}-py${{ matrix.python-version }}
+          path: dist
+
+  # linux-cross:
+  #   runs-on: ubuntu-latest
+  #   strategy:
+  #     matrix:
+  #       target: [aarch64, armv7, s390x, ppc64le, ppc64]
+  #       python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
+  #   steps:
+  #     - uses: actions/checkout@v3
+  #     - name: Set up QEMU
+  #       uses: docker/setup-qemu-action@v3
+  #     - name: Set up Docker Buildx
+  #       uses: docker/setup-buildx-action@v3
+  #     - uses: actions/setup-python@v4
+  #       with:
+  #         python-version: ${{ matrix.python-version }}
+  #     - name: Build wheels
+  #       uses: PyO3/maturin-action@v1
+  #       env:
+  #         PYO3_CROSS_PYTHON_VERSION: ${{ matrix.python-version }}
+  #         PYO3_CROSS: "1"
+  #         PYTHON_VERSION: ${{ matrix.python-version }}
+  #       with:
+  #         target: ${{ matrix.target }}
+  #         manylinux: auto
+  #         args: --release --out dist -i python${{ matrix.python-version }}
+  #         sccache: 'true'
+  #         docker-options: >-
+  #           --platform ${{ matrix.target == 'armv7' && 'linux/arm/v7' ||
+  #           matrix.target == 'aarch64' && 'linux/arm64' ||
+  #           matrix.target == 's390x' && 'linux/s390x' ||
+  #           matrix.target == 'ppc64le' && 'linux/ppc64le' ||
+  #           'linux/amd64' }}
+  #     - uses: uraimo/run-on-arch-action@v2.7.0
+  #       if: matrix.target != 'ppc64'
+  #       name: Install built wheel
+  #       with:
+  #         arch: ${{ matrix.target }}
+  #         distro: ubuntu_latest
+  #         githubToken: ${{ github.token }}
+  #         install: |
+  #           apt-get update
+  #           apt-get install -y --no-install-recommends python3 python3-pip python3-venv
+  #           pip3 install -U pip pytest "click>=8.0.0"
+  #         run: |
+  #           # List all wheels to debug
+  #           echo "Available wheels:"
+  #           ls -la dist/
+
+  #           # Install using find-links to handle platform tags correctly
+  #           pip3 install --find-links dist/ ${{ env.PACKAGE_NAME }}
+  #           pytest -v
+  #     - name: Upload wheels
+  #       uses: actions/upload-artifact@v4
+  #       with:
+  #         name: wheels-linux-cross-${{ matrix.target }}-py${{ matrix.python-version }}
+  #         path: dist
+
+  musllinux:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        target:
+          - x86_64-unknown-linux-musl
+          - i686-unknown-linux-musl
+        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+          architecture: x64
+      - name: Build wheels
+        uses: PyO3/maturin-action@v1
+        env:
+          PYO3_CROSS_PYTHON_VERSION: ${{ matrix.python-version }}
+          PYO3_CROSS: "1"
+        with:
+          target: ${{ matrix.target }}
+          manylinux: musllinux_1_2
+          args: --release --out dist -i python${{ matrix.python-version }}
+      - name: Install built wheel
+        if: matrix.target == 'x86_64-unknown-linux-musl'
+        uses: addnab/docker-run-action@v3
+        with:
+          image: alpine:latest
+          options: -v ${{ github.workspace }}:/io -w /io
+          run: |
+            # Install system dependencies
+            apk add --no-cache \
+              python3 \
+              py3-pip \
+              gcc \
+              musl-dev \
+              python3-dev \
+              rust \
+              cargo \
+              openssl-dev \
+              pkgconfig
+
+            # Create and activate virtual environment
+            python3 -m venv /venv
+            . /venv/bin/activate
+
+            # Install Python dependencies
+            pip install --upgrade pip wheel setuptools
+            pip install pytest "click>=8.0.0"
+
+            # Install in development mode
+            cd /io
+            pip install -e .
+
+            # Run tests
+            python -m pytest -v
+      - name: Upload wheels
+        uses: actions/upload-artifact@v4
+        with:
+          name: wheels-musllinux-${{ matrix.target }}-py${{ matrix.python-version }}
+          path: dist
+
+  # musllinux-cross:
+  #   runs-on: ubuntu-latest
+  #   strategy:
+  #     matrix:
+  #       platform:
+  #         - target: aarch64-unknown-linux-musl
+  #           arch: aarch64
+  #           platform: linux/arm64
+  #         - target: armv7-unknown-linux-musleabihf
+  #           arch: armv7
+  #           platform: linux/arm/v7
+  #       python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
+  #   steps:
+  #     - uses: actions/checkout@v3
+  #     - name: Set up QEMU
+  #       uses: docker/setup-qemu-action@v3
+  #     - name: Set up Docker Buildx
+  #       uses: docker/setup-buildx-action@v3
+  #     - uses: actions/setup-python@v4
+  #       with:
+  #         python-version: ${{ matrix.python-version }}
+  #     - name: Build wheels
+  #       uses: PyO3/maturin-action@v1
+  #       env:
+  #         PYO3_CROSS_PYTHON_VERSION: ${{ matrix.python-version }}
+  #         PYO3_CROSS: "1"
+  #       with:
+  #         target: ${{ matrix.platform.target }}
+  #         manylinux: musllinux_1_2
+  #         args: --release --out dist -i python${{ matrix.python-version }}
+  #         sccache: 'true'
+  #         docker-options: "--platform ${{ matrix.platform.platform }}"
+  #     - uses: uraimo/run-on-arch-action@master
+  #       name: Install built wheel
+  #       with:
+  #         arch: ${{ matrix.platform.arch }}
+  #         distro: alpine_latest
+  #         githubToken: ${{ github.token }}
+  #         install: |
+  #           apk add --no-cache \
+  #             python3 \
+  #             py3-pip \
+  #             gcc \
+  #             musl-dev \
+  #             python3-dev \
+  #             rust \
+  #             cargo \
+  #             openssl-dev \
+  #             pkgconfig
+  #         run: |
+  #           # Create and activate virtual environment
+  #           python3 -m venv /venv
+  #           . /venv/bin/activate
+
+  #           # Install Python dependencies
+  #           pip install --upgrade pip wheel setuptools
+  #           pip install pytest "click>=8.0.0"
+
+  #           # Install in development mode
+  #           cd /io
+  #           pip install -e .
+  #           # Run tests
+  #           python -m pytest -v
+  #     - name: Upload wheels
+  #       uses: actions/upload-artifact@v4
+  #       with:
+  #         name: wheels-musllinux-cross-${{ matrix.platform.target }}-py${{ matrix.python-version }}
+  #         path: dist
+
   sdist:
     runs-on: ubuntu-latest
@@ -114,9 +311,9 @@ jobs:
           command: sdist
           args: --out dist
       - name: Upload sdist
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
-          name: wheels
+          name: wheels-sdist-py${{ env.PYTHON_VERSION }}
           path: dist/*.tar.gz
           if-no-files-found: error

@@ -131,8 +328,18 @@ jobs:
     steps:
-      - uses: actions/download-artifact@v3
+      # Artifacts uploaded with upload-artifact@v4 must be fetched with
+      # download-artifact@v4; `pattern` plus `merge-multiple` collects every
+      # wheel and the sdist into a single dist/ directory in one step.
+      - uses: actions/download-artifact@v4
         with:
-          name: wheels
+          pattern: wheels-*
+          merge-multiple: true
           path: dist
+
       - name: Display structure of downloaded files
         run: ls -R dist
       - name: Publish to PyPI
@@ -140,4 +347,4 @@ jobs:
         with:
           packages-dir: dist/
           verbose: true
-          print-hash: true
\ No newline at end of file
+          print-hash: true
diff --git a/Cargo.toml b/Cargo.toml
index 8afad72fb..5411670d8 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -12,7 +12,10 @@ version = "0.32.4"

 [features]
 default = []
-python = ["pyo3/extension-module"]
+python = [
+    "pyo3/extension-module",
+    "serde_json"
+]

 [dependencies]
 aes = "~0.8.1"
@@ -27,7 +30,8 @@ num_cpus = "1.13.0"
 itertools = "~0.10.0"
 tempfile = "3.6.0"
 xor_name = "5.0.0"
-pyo3 = { version = "0.19", optional = true, features = ["extension-module"] }
+pyo3 = { version = "=0.20.3", optional = true, features = ["extension-module"] }
+serde_json = { version = "1.0", optional = true }

 [dependencies.brotli]
 version = "~3.3.0"
@@ -57,6 +61,7 @@ features = ["rt"]
 [dev-dependencies]
 criterion = "~0.3"
 docopt = "~0.9.0"
+clap = { version = "4.4", features = ["derive"] }

 [dev-dependencies.tokio]
 version = "1.34.0"
diff --git a/examples/parallel_streaming_decryptor.rs b/examples/parallel_streaming_decryptor.rs
index 24751a36e..6881519c3 100644
--- a/examples/parallel_streaming_decryptor.rs
+++ b/examples/parallel_streaming_decryptor.rs
@@ -3,18 +3,88 @@ use rayon::prelude::*;
 use self_encryption::{deserialize, streaming_decrypt_from_storage, DataMap, Error, Result};
 use std::{fs::File, io::Read, path::Path};
 use xor_name::XorName;
+use clap::{Parser, error::ErrorKind};
+
+/// Parallel streaming decryptor for self-encrypted files
+#[derive(Parser, Debug)]
+#[command(author, version, about, long_about = None)]
+struct Args {
+    /// Path to the data map file
+    #[arg(short, long, required = true)]
+    data_map: String,
+
+    /// Directory containing the encrypted chunks
+    #[arg(short, long, required = true)]
+    chunks_dir: String,
+
+    /// Path where the decrypted file should be written
+    #[arg(short, long, required = true)]
+    output: String,
+}
+
+fn validate_paths(args: &Args) -> Result<()> {
+    // Check data map file exists and is readable
+    if !Path::new(&args.data_map).exists() {
+        return Err(Error::Generic(format!(
+            "Data map file does not exist: {}",
+            args.data_map
+        )));
+    }
+
+    // Check chunks directory exists and is readable
+    let chunks_dir = Path::new(&args.chunks_dir);
+    if !chunks_dir.exists() {
+        return Err(Error::Generic(format!(
+            "Chunks directory does not exist: {}",
+            args.chunks_dir
+        )));
+    }
+    if !chunks_dir.is_dir() {
+        return Err(Error::Generic(format!(
+            "Chunks path is not a directory: {}",
+            args.chunks_dir
+        )));
+    }
+
+    // Check the output parent directory exists and is writable
+    let output_path = Path::new(&args.output);
+    if let Some(parent) = output_path.parent() {
+        if !parent.exists() {
+            return Err(Error::Generic(format!(
+                "Output directory does not exist: {}",
+                parent.display()
+            )));
+        }
+        // Try to verify write permissions; treat a metadata failure or a
+        // read-only directory as not writable
+        if parent
+            .metadata()
+            .map(|m| m.permissions().readonly())
+            .unwrap_or(true)
+        {
+            return Err(Error::Generic(format!(
+                "Output directory is not writable: {}",
+                parent.display()
+            )));
+        }
+    }
+
+    Ok(())
+}

 fn main() -> Result<()> {
-    // Load the data map from file or another source
-    let data_map = load_data_map("path/to/data_map")?;
+    let args = Args::parse();
+
+    // Validate all paths before proceeding
+    validate_paths(&args)?;
+
+    // Load the data map from file
+    let data_map = load_data_map(&args.data_map)?;

     // Implement the parallel chunk retrieval function
     let get_chunk_parallel = |hashes: &[XorName]| -> Result<Vec<Bytes>> {
         hashes
             .par_iter()
             .map(|hash| {
-                // Simulate network retrieval with local file read
-                let chunk_path = Path::new("chunks").join(hex::encode(hash));
+                let chunk_path = Path::new(&args.chunks_dir).join(hex::encode(hash));
                 let mut chunk_data = Vec::new();
                 File::open(&chunk_path)
                     .and_then(|mut file| file.read_to_end(&mut chunk_data))
@@ -25,7 +95,9 @@ fn main() -> Result<()> {
     };

     // Use the streaming decryption function
-    streaming_decrypt_from_storage(&data_map, Path::new("output_file.dat"), get_chunk_parallel)?;
+    streaming_decrypt_from_storage(&data_map, Path::new(&args.output), get_chunk_parallel)?;
+
+    println!("Successfully decrypted file to: {}", args.output);

     Ok(())
 }
diff --git a/pyproject.toml b/pyproject.toml
index 8a27e8310..2e12968a0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,27 +4,33 @@ build-backend = "maturin"

 [project]
 name = "self_encryption"
-requires-python = ">=3.7"
+version = "0.32.4"
+description = "Self encrypting files (convergent encryption plus obfuscation)"
+authors = [
+    {name = "MaidSafe Developers", email = "dev@maidsafe.net"}
+]
+requires-python = ">=3.8"
+license = {text = "GPL-3.0"}
 classifiers = [
-    "Programming Language :: Python :: 3",
     "Programming Language :: Rust",
-    "Operating System :: OS Independent",
+    "Programming Language :: Python :: Implementation :: CPython",
+    "Programming Language :: Python :: 3.8",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
 ]
 dependencies = [
-    "pip>=24.0",
-    "pytest>=7.4.4",
     "click>=8.0.0",
+    "pip>=24.3.1",
 ]

-[project.scripts]
-self-encryption = "self_encryption.cli:cli"
-
 [tool.maturin]
 features = ["python"]
 module-name = "self_encryption._self_encryption"
-bindings = "pyo3"
-develop = true
+python-packages = ["self_encryption"]
+include = ["self_encryption/**/*"]
+manifest-path = "Cargo.toml"

-[tool.pytest.ini_options]
-testpaths = ["tests"]
-python_files = ["test_*.py"]
+[project.scripts]
+self-encryption = "self_encryption.cli:cli"
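The packaging changes above pair with the reworked Python surface in the following files: `encrypt_from_file` now returns an `EncryptResult`, and the decrypt entry points take a chunks directory instead of a retrieval callback. A minimal round-trip sketch against that new surface (file names are illustrative, not from this diff):

```python
from pathlib import Path
from self_encryption import DataMap, encrypt_from_file, decrypt_from_storage

chunks_dir = Path("chunks")
chunks_dir.mkdir(exist_ok=True)

# Encrypt: returns an EncryptResult exposing .data_map and .names
result = encrypt_from_file("input.dat", str(chunks_dir))

# The data map round-trips through JSON, e.g. for storing next to the chunks
serialized = result.data_map.to_json()
data_map = DataMap.from_json(serialized)

# Decrypt straight from the chunk directory (no per-chunk callback in the new API)
decrypt_from_storage(data_map, "output.dat", str(chunks_dir))

assert Path("input.dat").read_bytes() == Path("output.dat").read_bytes()
```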
diff --git a/self_encryption/__init__.py b/self_encryption/__init__.py
index d875eea33..ae3927db4 100644
--- a/self_encryption/__init__.py
+++ b/self_encryption/__init__.py
@@ -1,145 +1,32 @@
-"""
-self_encryption - A convergent encryption library with obfuscation
-
-This library provides a secure way to encrypt data that supports deduplication while
-maintaining strong security through content obfuscation and chunk interdependencies.
-
-Key Features:
-  - Content-based chunking for deduplication
-  - Convergent encryption with obfuscation
-  - Self-validating chunks through content hashing
-  - Streaming operations for large files
-  - Parallel chunk processing
-  - Both in-memory and file-based operations
-  - Command-line interface for all operations
-
-Basic Usage:
-    >>> from self_encryption import encrypt, decrypt
-    >>> data = b"Hello, World!" * 1000  # Must be at least 3072 bytes
-    >>> data_map, chunks = encrypt(data)
-    >>> decrypted = decrypt(data_map, chunks)
-    >>> assert data == decrypted
-
-File Operations:
-    >>> from pathlib import Path
-    >>> from self_encryption import encrypt_from_file, decrypt_from_storage
-    >>> data_map, chunk_names = encrypt_from_file("input.dat", "chunks/")
-    >>> def get_chunk(hash_hex):
-    ...     return (Path("chunks") / hash_hex).read_bytes()
-    >>> decrypt_from_storage(data_map, "output.dat", get_chunk)
-
-Streaming Operations:
-    >>> from self_encryption import streaming_encrypt_from_file
-    >>> def store_chunk(name, content):
-    ...     (Path("chunks") / name).write_bytes(content)
-    >>> data_map = streaming_encrypt_from_file("large_file.dat", store_chunk)
-    >>> print(f"Created {data_map.len()} chunks")
-
-Command Line Usage:
-    The library includes a command-line interface for all operations:
-
-    # Encrypt a file
-    $ self-encryption encrypt-file input.dat chunks/
-
-    # Decrypt a file
-    $ self-encryption decrypt-file data_map.json chunks/ output.dat
-
-    # Verify a chunk
-    $ self-encryption verify chunks/abc123.dat
-
-    # Shrink a data map
-    $ self-encryption shrink data_map.json chunks/ optimized_map.json
-
-    For more information about CLI commands:
-    $ self-encryption --help
-
-Classes:
-    DataMap - Contains metadata about encrypted chunks
-        Methods:
-            new(chunk_infos) -> DataMap
-            with_child(chunk_infos, child) -> DataMap
-            child() -> Optional[int]
-            is_child() -> bool
-            len() -> int
-            infos() -> List[Tuple[int, bytes, bytes, int]]
-
-    EncryptedChunk - Represents an encrypted chunk of data
-        Methods:
-            new(content: bytes) -> EncryptedChunk
-            content() -> bytes
-            from_bytes(content: bytes) -> EncryptedChunk
-
-    XorName - Content-addressed names for chunks
-        Methods:
-            new(bytes) -> XorName
-            from_content(content) -> XorName
-            as_bytes() -> bytes
-
-Functions:
-    encrypt(data: bytes) -> Tuple[DataMap, List[EncryptedChunk]]
-        Encrypt data in memory, returning a data map and encrypted chunks.
-        The input data must be at least 3072 bytes.
-
-    encrypt_from_file(input_path: str, output_dir: str) -> Tuple[DataMap, List[str]]
-        Encrypt a file and store chunks to disk. Returns a data map and chunk names.
-        The input file must be at least 3072 bytes.
-
-    streaming_encrypt_from_file(input_path: str, store_chunk: Callable[[str, bytes], None]) -> DataMap
-        Stream-encrypt a file and store chunks using a custom storage backend.
-        Memory efficient for large files. Returns only the data map.
-
-    decrypt(data_map: DataMap, chunks: List[EncryptedChunk]) -> bytes
-        Decrypt data using provided chunks in memory.
-
-    decrypt_from_storage(data_map: DataMap, output_path: str, get_chunk: Callable) -> None
-        Decrypt data using chunks from storage, writing directly to a file.
-        Suitable for files that can fit in memory.
-
-    streaming_decrypt_from_storage(data_map: DataMap, output_path: str, get_chunks: Callable) -> None
-        Decrypt data using parallel chunk retrieval for improved performance.
-        Optimized for large files and remote storage backends.
-        Retrieves multiple chunks in parallel for better throughput.
-
-    shrink_data_map(data_map: DataMap, store_chunk: Callable) -> Tuple[DataMap, List[EncryptedChunk]]
-        Shrink a data map by recursively encrypting it. Useful for large files.
-
-    verify_chunk(name: XorName, content: bytes) -> EncryptedChunk
-        Verify the integrity of an encrypted chunk.
-
-For more detailed documentation about specific functions or classes:
-    >>> help(self_encryption.DataMap)
-    >>> help(self_encryption.encrypt)
-"""
-
-from ._self_encryption import (
-    DataMap,
-    EncryptedChunk,
-    XorName,
-    encrypt,
-    encrypt_from_file,
-    decrypt,
-    decrypt_from_storage,
-    shrink_data_map,
-    streaming_decrypt_from_storage,
-    verify_chunk,
-    streaming_encrypt_from_file,
-)
-
-from .cli import cli
-
-__version__ = "0.32.2"
+try:
+    from ._self_encryption import (
+        PyDataMap as DataMap,
+        PyXorName as XorName,
+        EncryptResult,
+        encrypt_from_file,
+        decrypt_from_storage,
+        streaming_decrypt_from_storage,
+        MIN_CHUNK_SIZE,
+        MIN_ENCRYPTABLE_BYTES,
+        MAX_CHUNK_SIZE,
+        COMPRESSION_QUALITY,
+    )
+    from .cli import cli
+except ImportError as e:
+    import sys
+    print(f"Error importing self_encryption: {e}", file=sys.stderr)
+    raise

 __all__ = [
-    "DataMap",
-    "EncryptedChunk",
-    "XorName",
-    "encrypt",
-    "encrypt_from_file",
-    "decrypt",
-    "decrypt_from_storage",
-    "shrink_data_map",
-    "streaming_decrypt_from_storage",
-    "verify_chunk",
-    "streaming_encrypt_from_file",
-    "cli",
+    'DataMap',
+    'XorName',
+    'EncryptResult',
+    'encrypt_from_file',
+    'decrypt_from_storage',
+    'streaming_decrypt_from_storage',
+    'MIN_CHUNK_SIZE',
+    'MIN_ENCRYPTABLE_BYTES',
+    'MAX_CHUNK_SIZE',
+    'COMPRESSION_QUALITY',
+    'cli',
 ]
\ No newline at end of file
diff --git a/self_encryption/cli.py b/self_encryption/cli.py
index 4695094cb..792bf7804 100644
--- a/self_encryption/cli.py
+++ b/self_encryption/cli.py
@@ -12,17 +12,12 @@ from typing import Optional
 import sys

-from self_encryption import (
-    DataMap,
-    EncryptedChunk,
-    XorName,
-    encrypt,
+from ._self_encryption import (
+    PyDataMap as DataMap,
+    PyXorName as XorName,
     encrypt_from_file,
-    decrypt,
     decrypt_from_storage,
-    shrink_data_map,
     streaming_decrypt_from_storage,
-    verify_chunk,
 )

 def print_error(message: str):
@@ -55,11 +50,9 @@ def encrypt_file(input_file: str, output_dir: str, json: bool):
         $ self-encryption encrypt-file input.dat chunks/
     """
     try:
-        data_map, chunk_names = encrypt_from_file(input_file, output_dir)
-        if json:
-            click.echo(data_map.to_json())
-        else:
-            click.echo(str(data_map))
+        result = encrypt_from_file(input_file, output_dir)
+        data_map = result.data_map
+        click.echo(data_map.to_json())
     except Exception as e:
         print_error(str(e))
         sys.exit(1)
@@ -67,8 +60,8 @@
 @cli.command()
 @click.argument('data-map-file', type=click.Path(exists=True, dir_okay=False))
 @click.argument('chunks-dir', type=click.Path(exists=True, file_okay=False))
-@click.argument('output-file', type=click.Path())
-@click.option('--streaming', is_flag=True, help='Use streaming decryption for large files')
+@click.argument('output-file', type=click.Path(dir_okay=False))
+@click.option('--streaming', is_flag=True, help='Use streaming decryption')
 def decrypt_file(data_map_file: str, chunks_dir: str, output_file: str, streaming: bool):
     """
     Decrypt a file using its data map and stored chunks.
@@ -81,21 +74,17 @@ def decrypt_file(data_map_file: str, chunks_dir: str, output_file: str, streaming: bool):
     """
     try:
         # Read data map from file
-        with open(data_map_file, 'r') as f:
-            data_map = DataMap.from_json(f.read())
-
-        chunks_path = Path(chunks_dir)
-
-        def get_chunk(hash_hex: str) -> bytes:
-            chunk_path = chunks_path / hash_hex
-            if not chunk_path.exists():
-                raise click.ClickException(f"Chunk not found: {hash_hex}")
-            return chunk_path.read_bytes()
+        data_map_str = Path(data_map_file).read_text()
+        try:
+            data_map = DataMap.from_json(data_map_str)
+        except Exception as e:
+            print_error(f"Failed to parse data map: {e}")
+            sys.exit(1)

         if streaming:
-            streaming_decrypt_from_storage(data_map, output_file, get_chunk)
+            streaming_decrypt_from_storage(data_map, output_file, chunks_dir)
         else:
-            decrypt_from_storage(data_map, output_file, get_chunk)
+            decrypt_from_storage(data_map, output_file, chunks_dir)
     except Exception as e:
         print_error(str(e))
diff --git a/src/encrypt.rs b/src/encrypt.rs
index dc69b8295..edf640a80 100644
--- a/src/encrypt.rs
+++ b/src/encrypt.rs
@@ -113,13 +113,14 @@ pub(crate) fn encrypt_chunk(content: Bytes, pki: (Pad, Key, Iv)) -> Result<Bytes>
     chunks: Vec<RawChunk>,
-) -> Result<(DataMap, Vec<EncryptedChunk>)> {
-    // Create a sorted vector of all hashes
+) -> Result<DataMap> {
+    // Create a sorted vector of all hashes - we still need this for encryption
     let src_hashes: Vec<_> = chunks.iter().map(|c| c.hash).collect();
+    let mut keys = Vec::with_capacity(chunks.len());

     // First, process chunks 2 onwards in parallel since they only need their previous two hashes
     let later_chunks: Vec<_> = chunks.iter().skip(2).collect();
-    let (mut keys, mut encrypted_chunks): (Vec<ChunkInfo>, Vec<EncryptedChunk>) = later_chunks
+    let later_chunk_infos: Vec<ChunkInfo> = later_chunks
         .into_par_iter()
         .map(|chunk| {
             let RawChunk { index, data, hash } = chunk;
@@ -129,21 +130,16 @@ pub(crate) fn encrypt_stream(
             let encrypted_content = encrypt_chunk(data.clone(), pki)?;
             let dst_hash = XorName::from_content(encrypted_content.as_ref());

-            Ok((
-                ChunkInfo {
-                    index: *index,
-                    dst_hash,
-                    src_hash: *hash,
-                    src_size,
-                },
-                EncryptedChunk {
-                    content: encrypted_content,
-                },
-            ))
+            Ok(ChunkInfo {
+                index: *index,
+                dst_hash,
+                src_hash: *hash,
+                src_size,
+            })
         })
-        .collect::<Result<Vec<_>>>()?
-        .into_iter()
-        .unzip();
+        .collect::<Result<Vec<_>>>()?;
+
+    keys.extend(later_chunk_infos);

     // Process chunk 1 (needs hash 0 and last hash)
     let chunk = &chunks[1];
@@ -161,12 +157,6 @@ pub(crate) fn encrypt_stream(
             src_size: chunk.data.len(),
         },
     );
-    encrypted_chunks.insert(
-        0,
-        EncryptedChunk {
-            content: encrypted_content,
-        },
-    );

     // Process chunk 0 (needs last two hashes)
     let chunk = &chunks[0];
@@ -184,12 +174,6 @@ pub(crate) fn encrypt_stream(
             src_size: chunk.data.len(),
         },
     );
-    encrypted_chunks.insert(
-        0,
-        EncryptedChunk {
-            content: encrypted_content,
-        },
-    );

-    Ok((DataMap::new(keys), encrypted_chunks))
+    Ok(DataMap::new(keys))
 }
diff --git a/src/lib.rs b/src/lib.rs
index 9e848a41e..f2bb9436c 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -68,9 +68,8 @@
     unused_comparisons,
     unused_features,
     unused_parens,
-    while_true,
-    warnings
 )]
+#![cfg_attr(not(feature = "python"), deny(warnings))]
 #![warn(
     trivial_casts,
     trivial_numeric_casts,
@@ -120,7 +119,7 @@ pub use bytes;
 pub use xor_name;

 /// The minimum size (before compression) of data to be self-encrypted, defined as 3B.
-pub const MIN_ENCRYPTABLE_BYTES: usize = 3 * MIN_CHUNK_SIZE;
+pub const MIN_ENCRYPTABLE_BYTES: usize = 3;

 /// The default maximum size (before compression) of an individual chunk of a file, defaulting as 1MiB.
 const DEFAULT_MAX_CHUNK_SIZE: usize = 1024 * 1024;
@@ -532,7 +531,7 @@ where

     // Create a buffered reader with a reasonable buffer size
     let mut reader = BufReader::with_capacity(1024 * 1024, file);
-    
+
     // Read all chunks first to get their hashes
     let mut chunks = Vec::with_capacity(num_chunks);
     for chunk_index in 0..num_chunks {
@@ -540,7 +539,7 @@ where
         let chunk_size = end - start;
         let mut chunk_data = vec![0u8; chunk_size];
         reader.read_exact(&mut chunk_data)?;
-        
+
         let hash = XorName::from_content(&chunk_data);
         chunks.push(crate::chunk::RawChunk {
             index: chunk_index,
@@ -548,15 +547,20 @@ where
             hash,
         });
     }
-    
-    // Process chunks in the correct order
-    let (data_map, encrypted_chunks) = encrypt::encrypt_stream(chunks)?;
-    
-    // Store all encrypted chunks
-    for chunk in encrypted_chunks {
-        chunk_store(XorName::from_content(&chunk.content), chunk.content)?;
+
+    // Process chunks and store them immediately
+    let data_map = encrypt::encrypt_stream(chunks.clone())?;
+
+    // Now encrypt and store each chunk
+    let src_hashes: Vec<_> = chunks.iter().map(|c| c.hash).collect();
+
+    for chunk in chunks {
+        let pki = get_pad_key_and_iv(chunk.index, &src_hashes);
+        let encrypted_content = encrypt::encrypt_chunk(chunk.data, pki)?;
+        let hash = XorName::from_content(&encrypted_content);
+        chunk_store(hash, encrypted_content)?;
     }
-    
+
     // Shrink the data map and store additional chunks if needed
     let (shrunk_data_map, _) = shrink_data_map(data_map, |hash, content| {
         chunk_store(hash, content)?;
@@ -619,10 +623,7 @@ where
         .iter()
         .map(|info| {
             let content = chunk_cache.get(&info.dst_hash).ok_or_else(|| {
-                Error::Generic(format!(
-                    "Chunk not found for hash: {:?}",
-                    info.dst_hash
-                ))
+                Error::Generic(format!("Chunk not found for hash: {:?}", info.dst_hash))
             })?;
             Ok(EncryptedChunk {
                 content: content.clone(),
@@ -751,10 +752,10 @@ mod tests {
         std::fs::write(&file_path, small_data)?;

         let store = |_: XorName, _: Bytes| -> Result<()> { Ok(()) };
-        
+
         let result = streaming_encrypt_from_file(&file_path, store);
         assert!(result.is_err());
-        
+
         Ok(())
     }

@@ -768,7 +769,7 @@ mod tests {

         let storage = Arc::new(Mutex::new(HashMap::new()));
         let storage_clone = storage.clone();
-        
+
         let store = move |hash: XorName, content: Bytes| -> Result<()> {
             let _ = storage_clone.lock().unwrap().insert(hash, content.to_vec());
             Ok(())
@@ -776,7 +777,7 @@ mod tests {

         let data_map = streaming_encrypt_from_file(&file_path, store)?;
         assert!(data_map.chunk_identifiers.len() >= 3);
-        
+
         Ok(())
     }

@@ -792,7 +793,7 @@ mod tests {

         let storage = Arc::new(Mutex::new(HashMap::new()));
         let storage_clone = storage.clone();
-        
+
         let store = move |hash: XorName, content: Bytes| -> Result<()> {
             let _ = storage_clone.lock().unwrap().insert(hash, content.to_vec());
             Ok(())
@@ -801,16 +802,22 @@ mod tests {
         // First get the number of chunks directly
         let bytes = Bytes::from(large_data.clone());
         let (num_chunks, _) = chunk::batch_chunks(bytes);
-        assert!(num_chunks > 3,
-            "Should have more than 3 chunks before shrinking. Got: {}", num_chunks);
+        assert!(
+            num_chunks > 3,
+            "Should have more than 3 chunks before shrinking. Got: {}",
+            num_chunks
+        );

         // Now test the streaming encryption
         let data_map = streaming_encrypt_from_file(&file_path, store)?;
-        
+
         // After shrinking, should be exactly 3 chunks
-        assert_eq!(data_map.chunk_identifiers.len(), 3,
-            "Final data map should have exactly 3 chunks after shrinking");
-        
+        assert_eq!(
+            data_map.chunk_identifiers.len(),
+            3,
+            "Final data map should have exactly 3 chunks after shrinking"
+        );
+
         // Verify chunk indices are sequential
         let mut prev_index = None;
         for chunk_info in &data_map.chunk_identifiers {
@@ -819,7 +826,7 @@ mod tests {
             }
             prev_index = Some(chunk_info.index);
         }
-        
+
         Ok(())
     }

@@ -846,7 +853,7 @@ mod tests {

         let result = streaming_encrypt_from_file(&file_path, store);
         assert!(result.is_err());
-        
+
         Ok(())
     }

@@ -860,7 +867,7 @@ mod tests {

         let storage = Arc::new(Mutex::new(HashMap::new()));
         let storage_clone = storage.clone();
-        
+
         let store = move |hash: XorName, content: Bytes| -> Result<()> {
             let _ = storage_clone.lock().unwrap().insert(hash, content.to_vec());
             Ok(())
@@ -868,7 +875,7 @@ mod tests {

         // Encrypt the file
         let data_map = streaming_encrypt_from_file(&file_path, store)?;
-        
+
         // Convert stored chunks to EncryptedChunk format
         let stored = storage.lock().unwrap();
         let encrypted_chunks: Vec<_> = stored
@@ -881,7 +888,7 @@ mod tests {
         // Decrypt and verify
         let decrypted = decrypt(&data_map, &encrypted_chunks)?;
         assert_eq!(&original_data[..], &decrypted[..]);
-        
+
         Ok(())
     }

@@ -916,10 +923,11 @@ mod tests {

         // Create a very large data map (12 chunks)
         let original_map = create_dummy_data_map(100000);
-        let (shrunk_map, _shrink_chunks) = shrink_data_map(original_map.clone(), |hash, content| {
-            let _ = storage_clone.lock().unwrap().insert(hash, content);
-            Ok(())
-        })?;
+        let (shrunk_map, _shrink_chunks) =
+            shrink_data_map(original_map.clone(), |hash, content| {
+                let _ = storage_clone.lock().unwrap().insert(hash, content);
+                Ok(())
+            })?;

         // Verify multiple levels of shrinking occurred
         assert!(shrunk_map.child().unwrap() > 0);
@@ -932,4 +940,76 @@ mod tests {

         Ok(())
     }
+
+    #[test]
+    fn test_encryption_algorithm_consistency() -> Result<()> {
+        // Create deterministic test data
+        let test_data = vec![42u8; MIN_ENCRYPTABLE_BYTES * 2]; // Repeating value for predictability
+
+        // First encryption
+        let storage1 = Arc::new(Mutex::new(HashMap::new()));
+        let storage1_clone = storage1.clone();
+
+        let store1 = move |hash: XorName, content: Bytes| -> Result<()> {
+            let _ = storage1_clone
+                .lock()
+                .unwrap()
+                .insert(hash, content.to_vec());
+            Ok(())
+        };
+
+        // Second encryption
+        let storage2 = Arc::new(Mutex::new(HashMap::new()));
+        let storage2_clone = storage2.clone();
+
+        let store2 = move |hash: XorName, content: Bytes| -> Result<()> {
+            let _ = storage2_clone
+                .lock()
+                .unwrap()
+                .insert(hash, content.to_vec());
+            Ok(())
+        };
+
+        // Create temporary files with same content
+        let temp_dir = tempfile::TempDir::new()?;
+        let file_path1 = temp_dir.path().join("test1.bin");
+        let file_path2 = temp_dir.path().join("test2.bin");
+
+        std::fs::write(&file_path1, &test_data)?;
+        std::fs::write(&file_path2, &test_data)?;
+
+        // Encrypt same data twice
+        let data_map1 = streaming_encrypt_from_file(&file_path1, store1)?;
+        let data_map2 = streaming_encrypt_from_file(&file_path2, store2)?;
+
+        // Compare data maps
+        assert_eq!(
+            data_map1.chunk_identifiers.len(),
+            data_map2.chunk_identifiers.len(),
+            "Data maps should have same number of chunks"
+        );
+
+        // Compare stored chunks
+        let stored1 = storage1.lock().unwrap();
+        let stored2 = storage2.lock().unwrap();
+
+        assert_eq!(
+            stored1.len(),
+            stored2.len(),
+            "Should have same number of stored chunks"
+        );
+
+        // Compare each chunk's content
+        for (hash, content1) in stored1.iter() {
+            let content2 = stored2
+                .get(hash)
+                .expect("Chunk should exist in both storages");
+            assert_eq!(
+                content1, content2,
+                "Encrypted chunks should be identical for same input"
+            );
+        }
+
+        Ok(())
+    }
 }
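The consistency test above relies on the encryption being convergent: the same plaintext always yields the same chunk set. The same property can be observed from the Python side of the bindings; a small sketch, assuming two copies of the same file (paths are illustrative):

```python
import shutil
from pathlib import Path
from self_encryption import encrypt_from_file

shutil.copy("input.dat", "input_copy.dat")
Path("chunks_a").mkdir(exist_ok=True)
Path("chunks_b").mkdir(exist_ok=True)

a = encrypt_from_file("input.dat", "chunks_a")
b = encrypt_from_file("input_copy.dat", "chunks_b")

# Convergent encryption: identical input should produce identical chunk names
assert sorted(str(n) for n in a.names) == sorted(str(n) for n in b.names)
```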
diff --git a/src/python.rs b/src/python.rs
index 5f8656089..f410ee920 100644
--- a/src/python.rs
+++ b/src/python.rs
@@ -1,617 +1,160 @@
-/// Python bindings for self-encryption functionality.
-///
-/// This module provides Python bindings for the self-encryption library, which implements
-/// convergent encryption with content deduplication and obfuscation. The library splits
-/// data into chunks, encrypts them independently, and creates a data map that describes
-/// how to reassemble the original data.
-///
-/// Key Features:
-/// - Content-based chunking for deduplication
-/// - Convergent encryption with obfuscation
-/// - Self-validating chunks through content hashing
-/// - Streaming operations for large files
-/// - Parallel chunk processing
-/// - Both in-memory and file-based operations
-///
-/// Basic Usage:
-/// ```python
-/// from self_encryption import encrypt, decrypt
-///
-/// # In-memory encryption
-/// data = b"Hello, World!" * 1000  # Must be at least 3072 bytes
-/// data_map, chunks = encrypt(data)
-/// decrypted = decrypt(data_map, chunks)
-/// assert data == decrypted
-///
-/// # File-based encryption
-/// from pathlib import Path
-/// data_map, chunk_names = encrypt_from_file("input.dat", "chunks/")
-///
-/// # Custom storage backend
-/// def get_chunk(hash_hex):
-///     return (Path("chunks") / hash_hex).read_bytes()
-/// decrypt_from_storage(data_map, "output.dat", get_chunk)
-/// ```
+use pyo3::prelude::*;
+use bytes::Bytes;

 use crate::{
-    decrypt as rust_decrypt, decrypt_from_storage as rust_decrypt_from_storage,
-    encrypt as rust_encrypt, encrypt_from_file as rust_encrypt_from_file,
-    shrink_data_map as rust_shrink_data_map,
+    DataMap,
+    XorName,
+    Error,
+    encrypt_from_file as rust_encrypt_from_file,
+    decrypt_from_storage as rust_decrypt_from_storage,
     streaming_decrypt_from_storage as rust_streaming_decrypt_from_storage,
-    streaming_encrypt_from_file as rust_streaming_encrypt_from_file,
-    ChunkInfo, DataMap as RustDataMap, EncryptedChunk as RustEncryptedChunk, Error, Result,
 };
-use bytes::Bytes;
-use pyo3::prelude::*;
-use pyo3::types::{PyBytes, PyType};
-use std::path::PathBuf;
-use xor_name::XorName;

-#[pyclass(name = "DataMap")]
-/// A data map containing information about encrypted chunks.
-///
-/// The DataMap contains metadata about how a file was split and encrypted into chunks,
-/// including the hashes needed to verify and decrypt the chunks.
-///
-/// Attributes:
-///     child (Optional[int]): The child level of this data map, if it's part of a hierarchy
-///     len (int): The number of chunks in this data map
-///
-/// Methods:
-///     is_child() -> bool: Check if this is a child data map
-///     infos() -> List[Tuple[int, bytes, bytes, int]]: Get chunk information
+// Make DataMap usable from Python
+#[pyclass]
 #[derive(Clone)]
-struct PyDataMap {
-    inner: RustDataMap,
+pub struct PyDataMap {
+    inner: DataMap,
 }

-#[pyclass(name = "EncryptedChunk")]
-/// An encrypted chunk of data.
-///
-/// Represents a single encrypted chunk of data that was created during the encryption process.
-///
-/// Methods:
-///     content() -> bytes: Get the encrypted content of this chunk
-///     from_bytes(content: bytes) -> EncryptedChunk: Create a new chunk from bytes
-#[derive(Clone)]
-struct PyEncryptedChunk {
-    inner: RustEncryptedChunk,
-}
-
-#[pyclass(name = "XorName")]
-#[derive(Clone)]
-struct PyXorName {
-    inner: XorName,
-}

 #[pymethods]
 impl PyDataMap {
     #[new]
-    /// Create a new DataMap from chunk information.
-    ///
-    /// Args:
-    ///     chunk_infos: List of tuples containing (index, dst_hash, src_hash, src_size)
-    ///
-    /// Returns:
-    ///     DataMap: A new data map instance
-    fn new(chunk_infos: Vec<(usize, Vec<u8>, Vec<u8>, usize)>) -> Self {
-        let infos = chunk_infos
-            .into_iter()
-            .map(|(index, dst_hash, src_hash, src_size)| ChunkInfo {
-                index,
-                dst_hash: XorName::from_content(&dst_hash),
-                src_hash: XorName::from_content(&src_hash),
-                src_size,
-            })
-            .collect();
-        Self {
-            inner: RustDataMap::new(infos),
+    pub fn new() -> Self {
+        PyDataMap {
+            inner: DataMap::new(vec![])
         }
     }

-    #[staticmethod]
-    /// Create a new DataMap with a child level.
-    ///
-    /// Args:
-    ///     chunk_infos: List of tuples containing (index, dst_hash, src_hash, src_size)
-    ///     child: The child level for this data map
-    ///
-    /// Returns:
-    ///     DataMap: A new data map instance with the specified child level
-    fn with_child(chunk_infos: Vec<(usize, Vec<u8>, Vec<u8>, usize)>, child: usize) -> Self {
-        let infos = chunk_infos
-            .into_iter()
-            .map(|(index, dst_hash, src_hash, src_size)| ChunkInfo {
-                index,
-                dst_hash: XorName::from_content(&dst_hash),
-                src_hash: XorName::from_content(&src_hash),
-                src_size,
-            })
-            .collect();
-        Self {
-            inner: RustDataMap::with_child(infos, child),
-        }
+    pub fn __str__(&self) -> PyResult<String> {
+        Ok(format!("{:?}", self.inner))
     }

-    /// Get the child level of this data map.
-    ///
-    /// Returns:
-    ///     Optional[int]: The child level if this is a child data map, None otherwise
-    fn child(&self) -> Option<usize> {
-        self.inner.child()
+    pub fn to_json(&self) -> PyResult<String> {
+        serde_json::to_string(&self.inner)
+            .map_err(|e| PyErr::new::<pyo3::exceptions::PyValueError, _>(e.to_string()))
     }

-    /// Check if this is a child data map.
-    ///
-    /// Returns:
-    ///     bool: True if this is a child data map, False otherwise
-    fn is_child(&self) -> bool {
-        self.inner.is_child()
-    }
-
-    /// Get the number of chunks in this data map.
-    ///
-    /// Returns:
-    ///     int: The number of chunks
-    fn len(&self) -> usize {
-        self.inner.len()
-    }
-
-    /// Get information about all chunks in this data map.
-    ///
-    /// Returns:
-    ///     List[Tuple[int, bytes, bytes, int]]: List of tuples containing
-    ///     (index, dst_hash, src_hash, src_size) for each chunk
-    fn infos(&self) -> Vec<(usize, Vec<u8>, Vec<u8>, usize)> {
-        self.inner
-            .infos()
-            .into_iter()
-            .map(|info| {
-                (
-                    info.index,
-                    info.dst_hash.0.to_vec(),
-                    info.src_hash.0.to_vec(),
-                    info.src_size,
-                )
-            })
-            .collect()
+    #[staticmethod]
+    pub fn from_json(json_str: &str) -> PyResult<Self> {
+        serde_json::from_str(json_str)
+            .map(|inner| PyDataMap { inner })
+            .map_err(|e| PyErr::new::<pyo3::exceptions::PyValueError, _>(e.to_string()))
     }
 }

+impl From<DataMap> for PyDataMap {
+    fn from(inner: DataMap) -> Self {
+        PyDataMap { inner }
+    }
+}

-#[pymethods]
-impl PyEncryptedChunk {
-    #[new]
-    /// Create a new EncryptedChunk from bytes.
-    ///
-    /// Args:
-    ///     content (bytes): The encrypted content
-    ///
-    /// Returns:
-    ///     EncryptedChunk: A new encrypted chunk instance
-    fn new(content: Vec<u8>) -> Self {
-        Self {
-            inner: RustEncryptedChunk {
-                content: Bytes::from(content),
-            },
-        }
-    }
+// Make XorName usable from Python
+#[pyclass]
+#[derive(Clone)]
+pub struct PyXorName {
+    inner: XorName,
+}

-    /// Get the content of this chunk.
-    ///
-    /// Returns:
-    ///     bytes: The encrypted content
-    fn content(&self) -> &[u8] {
-        &self.inner.content
-    }
+impl PyXorName {
+    pub fn new(inner: XorName) -> Self {
+        PyXorName { inner }
+    }
+}

-    #[classmethod]
-    /// Create a new EncryptedChunk from Python bytes.
-    ///
-    /// Args:
-    ///     content (bytes): The encrypted content
-    ///
-    /// Returns:
-    ///     EncryptedChunk: A new encrypted chunk instance
-    fn from_bytes(_cls: &PyType, content: &PyBytes) -> PyResult<Self> {
-        Ok(Self::new(content.as_bytes().to_vec()))
-    }
-}

 #[pymethods]
 impl PyXorName {
     #[new]
-    fn new(bytes: &PyBytes) -> Self {
-        Self {
-            inner: XorName::from_content(bytes.as_bytes()),
+    fn py_new(bytes: &[u8]) -> PyResult<Self> {
+        if bytes.len() != 32 {
+            return Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
+                "XorName must be exactly 32 bytes"
+            ));
         }
+        let mut array = [0u8; 32];
+        array.copy_from_slice(bytes);
+        Ok(PyXorName {
+            inner: XorName(array)
+        })
     }

-    #[staticmethod]
-    fn from_content(content: &PyBytes) -> Self {
-        Self {
-            inner: XorName::from_content(content.as_bytes()),
-        }
+    fn __str__(&self) -> PyResult<String> {
+        Ok(hex::encode(self.inner.0))
     }
+}

-    fn as_bytes(&self) -> Vec<u8> {
-        self.inner.0.to_vec()
+impl From<XorName> for PyXorName {
+    fn from(inner: XorName) -> Self {
+        PyXorName { inner }
     }
 }

-#[pyfunction]
-/// Encrypt data in memory.
-///
-/// This function takes raw bytes, splits them into chunks, and encrypts each chunk
-/// independently. It returns both the data map (which describes how to reassemble
-/// the chunks) and the encrypted chunks themselves.
-///
-/// The minimum input size is 3072 bytes, as this is required for the chunking
-/// algorithm to work effectively.
-///
-/// Args:
-///     data (bytes): The data to encrypt (must be at least 3072 bytes)
-///
-/// Returns:
-///     Tuple[DataMap, List[EncryptedChunk]]: A tuple containing:
-///         - DataMap: Contains metadata about the encrypted chunks
-///         - List[EncryptedChunk]: The list of encrypted chunks
-///
-/// Raises:
-///     ValueError: If the input data is too small or encryption fails
-///
-/// Example:
-///     ```python
-///     data = b"Hello, World!" * 1000
-///     data_map, chunks = encrypt(data)
-///     ```
-fn encrypt(_py: Python<'_>, data: &PyBytes) -> PyResult<(PyDataMap, Vec<PyEncryptedChunk>)> {
-    let bytes = Bytes::from(data.as_bytes().to_vec());
-    let (data_map, chunks) = rust_encrypt(bytes)
-        .map_err(|e| PyErr::new::<pyo3::exceptions::PyValueError, _>(e.to_string()))?;
-
-    Ok((
-        PyDataMap { inner: data_map },
-        chunks
-            .into_iter()
-            .map(|c| PyEncryptedChunk { inner: c })
-            .collect(),
-    ))
+// Create a Python tuple type for our return value
+#[pyclass]
+pub struct EncryptResult {
+    #[pyo3(get)]
+    data_map: PyDataMap,
+    #[pyo3(get)]
+    names: Vec<PyXorName>,
 }

-#[pyfunction]
-/// Encrypt a file and store chunks to disk.
-///
-/// This function reads a file, splits it into chunks, encrypts each chunk, and
-/// stores the encrypted chunks in the specified output directory. The chunks are
-/// named using their content hash (XorName) in hexadecimal format.
-///
-/// The input file must be at least 3072 bytes in size.
-///
-/// Args:
-///     input_path (str): Path to the input file to encrypt
-///     output_dir (str): Directory where encrypted chunks will be stored
-///
-/// Returns:
-///     Tuple[DataMap, List[str]]: A tuple containing:
-///         - DataMap: Contains metadata about the encrypted chunks
-///         - List[str]: List of chunk filenames (hex format of their XorNames)
-///
-/// Raises:
-///     ValueError: If the input file is too small or encryption fails
-///     OSError: If there are file system errors
-///
-/// Example:
-///     ```python
-///     data_map, chunk_names = encrypt_from_file("input.dat", "chunks/")
-///     print(f"Created {len(chunk_names)} chunks")
-///     ```
-fn encrypt_from_file(input_path: String, output_dir: String) -> PyResult<(PyDataMap, Vec<String>)> {
-    let (data_map, chunk_names) =
-        rust_encrypt_from_file(&PathBuf::from(input_path), &PathBuf::from(output_dir))
-            .map_err(|e| PyErr::new::<pyo3::exceptions::PyValueError, _>(e.to_string()))?;
+#[pymodule]
+fn _self_encryption(py: Python<'_>, m: &PyModule) -> PyResult<()> {
+    // Register classes
+    m.add_class::<PyDataMap>()?;
+    m.add_class::<PyXorName>()?;
+    m.add_class::<EncryptResult>()?;

-    Ok((
-        PyDataMap { inner: data_map },
-        chunk_names
-            .into_iter()
-            .map(|name| hex::encode(name.0))
-            .collect(),
-    ))
-}
+    // Register constants
+    m.setattr("MIN_CHUNK_SIZE", 1i32)?;
+    m.setattr("MIN_ENCRYPTABLE_BYTES", 3i32)?;
+    m.setattr("MAX_CHUNK_SIZE", 1024 * 1024i32)?;
+    m.setattr("COMPRESSION_QUALITY", 6i32)?;

-#[pyfunction]
-/// Decrypt data using provided chunks in memory.
-///
-/// This function takes a data map and its corresponding encrypted chunks and
-/// reassembles the original data. All operations are performed in memory.
-///
-/// Args:
-///     data_map (DataMap): The data map containing chunk metadata
-///     chunks (List[EncryptedChunk]): The encrypted chunks to decrypt
-///
-/// Returns:
-///     bytes: The decrypted original data
-///
-/// Raises:
-///     ValueError: If decryption fails or chunks are invalid
-///
-/// Example:
-///     ```python
-///     decrypted = decrypt(data_map, chunks)
-///     with open("output.dat", "wb") as f:
-///         f.write(decrypted)
-///     ```
-fn decrypt(data_map: &PyDataMap, chunks: Vec<PyEncryptedChunk>) -> PyResult<Py<PyBytes>> {
-    let chunks: Vec<RustEncryptedChunk> = chunks.into_iter().map(|c| c.inner).collect();
-    let result = rust_decrypt(&data_map.inner, &chunks)
-        .map_err(|e| PyErr::new::<pyo3::exceptions::PyValueError, _>(e.to_string()))?;
+    // Register functions
+    m.add_function(wrap_pyfunction!(encrypt_from_file, py)?)?;
+    m.add_function(wrap_pyfunction!(decrypt_from_storage, py)?)?;
+    m.add_function(wrap_pyfunction!(streaming_decrypt_from_storage, py)?)?;

-    Python::with_gil(|py| Ok(PyBytes::new(py, &result).into()))
+    Ok(())
 }

 #[pyfunction]
-/// Decrypt data using chunks from storage, writing directly to a file.
-///
-/// This function decrypts data by retrieving chunks from a storage backend using
-/// the provided callback function. It writes the decrypted data directly to the
-/// specified output file, making it suitable for large files that shouldn't be
-/// held entirely in memory.
-///
-/// Args:
-///     data_map (DataMap): The data map containing chunk metadata
-///     output_path (str): Path where the decrypted file will be written
-///     get_chunk (Callable[[str], bytes]): Function that retrieves chunks by their hex name
-///
-/// Raises:
-///     ValueError: If decryption fails or chunks are invalid
-///     OSError: If there are file system errors
-///
-/// Example:
-///     ```python
-///     def get_chunk(hex_name):
-///         chunk_path = Path("chunks") / hex_name
-///         return chunk_path.read_bytes()
-///
-///     decrypt_from_storage(data_map, "output.dat", get_chunk)
-///     ```
-fn decrypt_from_storage(
-    py: Python<'_>,
-    data_map: &PyDataMap,
-    output_path: String,
-    py_get_chunk: PyObject,
-) -> PyResult<()> {
-    let mut get_chunk = |hash: XorName| -> Result<Bytes> {
-        let hash_hex = hex::encode(hash.0);
-        let result = py_get_chunk
-            .call1(py, (hash_hex,))
-            .map_err(|e| Error::Generic(format!("Python callback error: {}", e)))?;
-        let chunk_data: Vec<u8> = result
-            .extract(py)
-            .map_err(|e| Error::Generic(format!("Python data extraction error: {}", e)))?;
-        Ok(Bytes::from(chunk_data))
-    };
-
-    rust_decrypt_from_storage(&data_map.inner, &PathBuf::from(output_path), &mut get_chunk)
+fn encrypt_from_file(file_path: &str, output_dir: &str) -> PyResult<EncryptResult> {
+    let path = std::path::Path::new(file_path);
+    let out_path = std::path::Path::new(output_dir);
+    rust_encrypt_from_file(path, out_path)
+        .map(|(data_map, names)| EncryptResult {
+            data_map: PyDataMap::from(data_map),
+            names: names.into_iter().map(PyXorName::from).collect(),
+        })
         .map_err(|e| PyErr::new::<pyo3::exceptions::PyValueError, _>(e.to_string()))
 }

 #[pyfunction]
-/// Decrypt data with parallel chunk retrieval for improved performance.
-///
-/// This function is optimized for performance when working with large files and
-/// remote storage backends. It retrieves multiple chunks in parallel and writes
-/// the decrypted data directly to a file.
-///
-/// Key features:
-/// - Parallel chunk retrieval for better throughput
-/// - Streaming write to output file
-/// - Memory efficient for large files
-/// - Optimized for remote storage backends
-///
-/// Args:
-///     data_map (DataMap): The data map containing chunk metadata
-///     output_path (str): Path where the decrypted file will be written
-///     get_chunks (Callable[[List[str]], List[bytes]]): Function that retrieves multiple
-///         chunks in parallel, given their hex names. Should accept a list of hex names
-///         and return a list of chunk contents in the same order.
-///
-/// Raises:
-///     ValueError: If decryption fails or chunks are invalid
-///     OSError: If there are file system errors
-///
-/// Example:
-///     ```python
-///     from concurrent.futures import ThreadPoolExecutor
-///     from pathlib import Path
-///
-///     def get_chunks(hex_names):
-///         # Parallel chunk retrieval using thread pool
-///         def get_one(name):
-///             chunk_path = Path("chunks") / name
-///             return chunk_path.read_bytes()
-///
-///         with ThreadPoolExecutor(max_workers=4) as executor:
-///             return list(executor.map(get_one, hex_names))
-///
-///     streaming_decrypt_from_storage(data_map, "output.dat", get_chunks)
-///     ```
-fn streaming_decrypt_from_storage(
-    py: Python<'_>,
-    data_map: &PyDataMap,
-    output_path: String,
-    py_get_chunks: PyObject,
-) -> PyResult<()> {
-    let get_chunk_parallel = |hashes: &[XorName]| -> Result<Vec<Bytes>> {
-        let hash_hexes: Vec<String> = hashes.iter().map(|h| hex::encode(h.0)).collect();
-        let chunks = py_get_chunks
-            .call1(py, (hash_hexes,))
-            .map_err(|e| Error::Generic(format!("Python callback error: {}", e)))?;
-        let chunk_data: Vec<Vec<u8>> = chunks
-            .extract(py)
-            .map_err(|e| Error::Generic(format!("Python data extraction error: {}", e)))?;
-        Ok(chunk_data.into_iter().map(Bytes::from).collect())
-    };
-
-    rust_streaming_decrypt_from_storage(
-        &data_map.inner,
-        &PathBuf::from(output_path),
-        get_chunk_parallel,
-    )
+fn decrypt_from_storage(data_map: &PyDataMap, output_file: &str, chunks_dir: &str) -> PyResult<()> {
+    let out_path = std::path::Path::new(output_file);
+    let chunks_path = std::path::Path::new(chunks_dir);
+    rust_decrypt_from_storage(&data_map.inner, out_path, |hash| {
+        let chunk_path = chunks_path.join(hex::encode(hash));
+        std::fs::read(chunk_path)
+            .map(Bytes::from)
+            .map_err(|e| Error::Generic(e.to_string()))
+    })
     .map_err(|e| PyErr::new::<pyo3::exceptions::PyValueError, _>(e.to_string()))
 }

 #[pyfunction]
-/// Stream-encrypt a file and store chunks using a custom storage backend.
-///
-/// This function reads and encrypts a file in chunks, using less memory than the standard
-/// encrypt_from_file function. It's ideal for large files and custom storage backends.
-///
-/// Key features:
-/// - Memory efficient streaming encryption
-/// - Custom storage backend support
-/// - Parallel processing where possible
-/// - Maintains encryption security properties
-///
-/// Args:
-///     input_path (str): Path to the input file to encrypt
-///     store_chunk (Callable[[str, bytes], None]): Function to store encrypted chunks.
-///         Takes two arguments:
-///         - chunk_name (str): The hex name (XorName) of the chunk
-///         - content (bytes): The encrypted chunk content
-///
-/// Returns:
-///     DataMap: Contains metadata about the encrypted chunks
-///
-/// Raises:
-///     ValueError: If the input file is too small or encryption fails
-///     OSError: If there are file system errors
-///
-/// Example:
-///     ```python
-///     def store_chunk(name, content):
-///         chunk_path = Path("chunks") / name
-///         chunk_path.write_bytes(content)
-///
-///     data_map = streaming_encrypt_from_file("large_file.dat", store_chunk)
-///     print(f"Created {data_map.len()} chunks")
-///     ```
-fn streaming_encrypt_from_file(
-    _py: Python<'_>,
-    input_path: String,
-    py_store_chunk: PyObject,
-) -> PyResult<PyDataMap> {
-    let input_path = PathBuf::from(input_path);
-
-    // Create a closure that calls the Python store_chunk function
-    let mut store_chunk = |name: XorName, content: Bytes| -> Result<()> {
-        // Convert data outside of the Python context
-        let bytes: &[u8] = name.as_ref();
-        let name_hex = hex::encode(bytes);
-        let content_bytes = content.to_vec();
-
-        // Acquire the GIL and call the Python function
-        let result: Result<()> = Python::with_gil(|py| {
-            let args = (name_hex, PyBytes::new(py, &content_bytes));
-            py_store_chunk
-                .call1(py, args)
-                .map(|_| ())
-                .map_err(|e| Error::Python(format!("Store chunk failed: {}", e)))?;
-            Ok(())
-        });
-        result
-    };
-
-    // Call the Rust streaming encryption function
-    match rust_streaming_encrypt_from_file(&input_path, &mut store_chunk) {
-        Ok(data_map) => Ok(PyDataMap { inner: data_map }),
-        Err(e) => Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(format!(
-            "Streaming encryption failed: {}",
-            e
-        ))),
-    }
-}
-
-#[pyfunction]
-/// Verify the integrity of an encrypted chunk.
-///
-/// This function checks if an encrypted chunk's content matches its XorName,
-/// ensuring the chunk hasn't been corrupted or tampered with.
-///
-/// Args:
-///     name (XorName): The expected XorName of the chunk
-///     content (bytes): The chunk's content to verify
-///
-/// Returns:
-///     EncryptedChunk: A verified encrypted chunk object
-///
-/// Raises:
-///     ValueError: If the chunk's content doesn't match its name
-///
-/// Example:
-///     ```python
-///     chunk = verify_chunk(name, content)
-///     assert chunk.content() == content
-///     ```
-fn verify_chunk(name: &PyXorName, content: &PyBytes) -> PyResult<PyEncryptedChunk> {
-    match crate::verify_chunk(name.inner, content.as_bytes()) {
-        Ok(chunk) => Ok(PyEncryptedChunk { inner: chunk }),
-        Err(e) => Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-            e.to_string(),
-        )),
-    }
-}
-
-#[pyfunction]
-/// Shrink a data map by recursively encrypting it.
-///
-/// This is useful for handling large files that produce large data maps.
-///
-/// Args:
-///     data_map (DataMap): The data map to shrink
-///     store_chunk (Callable[[str, bytes], None]): Function to store new chunks
-///
-/// Returns:
-///     Tuple[DataMap, List[EncryptedChunk]]: The shrunk data map and new chunks
-///
-/// Raises:
-///     ValueError: If shrinking fails
-fn shrink_data_map(
-    py: Python<'_>,
-    data_map: &PyDataMap,
-    py_store_chunk: PyObject,
-) -> PyResult<(PyDataMap, Vec<PyEncryptedChunk>)> {
-    let mut store_chunk = |hash: XorName, content: Bytes| -> Result<()> {
-        let hash_hex = hex::encode(hash.0);
-        let content_vec = content.to_vec();
-        let _ = py_store_chunk
-            .call1(py, (hash_hex, content_vec))
-            .map_err(|e| Error::Generic(format!("Python callback error: {}", e)))?;
-        Ok(())
-    };
-
-    let (shrunk_map, chunks) = rust_shrink_data_map(data_map.inner.clone(), &mut store_chunk)
-        .map_err(|e| PyErr::new::<pyo3::exceptions::PyValueError, _>(e.to_string()))?;
-
-    Ok((
-        PyDataMap { inner: shrunk_map },
-        chunks
-            .into_iter()
-            .map(|c| PyEncryptedChunk { inner: c })
-            .collect(),
-    ))
-}
-
-#[pymodule]
-fn _self_encryption(_py: Python<'_>, m: &PyModule) -> PyResult<()> {
-    m.add_class::<PyDataMap>()?;
-    m.add_class::<PyEncryptedChunk>()?;
-    m.add_class::<PyXorName>()?;
-    m.add_function(wrap_pyfunction!(encrypt, m)?)?;
-    m.add_function(wrap_pyfunction!(encrypt_from_file, m)?)?;
-    m.add_function(wrap_pyfunction!(decrypt, m)?)?;
-    m.add_function(wrap_pyfunction!(decrypt_from_storage, m)?)?;
-    m.add_function(wrap_pyfunction!(shrink_data_map, m)?)?;
-    m.add_function(wrap_pyfunction!(streaming_decrypt_from_storage, m)?)?;
-    m.add_function(wrap_pyfunction!(streaming_encrypt_from_file, m)?)?;
-    m.add_function(wrap_pyfunction!(verify_chunk, m)?)?;
-    Ok(())
+fn streaming_decrypt_from_storage(data_map: &PyDataMap, output_file: &str, chunks_dir: &str) -> PyResult<()> {
+    let out_path = std::path::Path::new(output_file);
+    let chunks_path = std::path::Path::new(chunks_dir);
+    rust_streaming_decrypt_from_storage(&data_map.inner, out_path, |hashes| {
+        hashes.iter().map(|hash| {
+            let chunk_path = chunks_path.join(hex::encode(hash));
+            std::fs::read(chunk_path)
+                .map(Bytes::from)
+                .map_err(|e| Error::Generic(e.to_string()))
+        }).collect()
+    })
    .map_err(|e| PyErr::new::<pyo3::exceptions::PyValueError, _>(e.to_string()))
 }
diff --git a/tests/lib.rs b/tests/lib.rs
index 30e04f857..d55a11d84 100644
--- a/tests/lib.rs
+++ b/tests/lib.rs
@@ -34,7 +34,6 @@
     unused_features,
     unused_parens,
     while_true,
-    warnings
 )]
 #![warn(
     trivial_casts,
@@ -205,3 +204,4 @@ fn test_data_map_len_and_is_child() {
     assert_eq!(empty_data_map.len(), 0);
     assert!(!empty_data_map.is_child());
 }
+
diff --git a/tests/test_bindings.py b/tests/test_bindings.py
index 1d80f11fd..c51927cd4 100644
--- a/tests/test_bindings.py
+++ b/tests/test_bindings.py
@@ -5,29 +5,12 @@ import pytest

 from self_encryption import (
     DataMap,
-    EncryptedChunk,
     XorName,
-    encrypt,
     encrypt_from_file,
-    decrypt,
     decrypt_from_storage,
-    shrink_data_map,
     streaming_decrypt_from_storage,
-    verify_chunk,
 )

-def test_basic_encryption_decryption():
-    # Test data
-    data = b"x" * 10_000_000  # 10MB of data
-
-    # Encrypt
-    data_map, chunks = encrypt(data)
-    assert len(chunks) > 0
-
-    # Decrypt
-    decrypted = decrypt(data_map, chunks)
-    assert data == decrypted
-
 def test_file_encryption_decryption():
     with tempfile.TemporaryDirectory() as temp_dir:
         # Create test file
@@ -40,94 +23,16 @@ def test_file_encryption_decryption():
         chunk_dir.mkdir()

         # Encrypt file
-        data_map, chunk_names = encrypt_from_file(str(input_path), str(chunk_dir))
-
-        # Create chunk retrieval function
-        def get_chunk(hash_hex: str) -> bytes:
-            chunk_path = chunk_dir / hash_hex
-            return chunk_path.read_bytes()
+        result = encrypt_from_file(str(input_path), str(chunk_dir))
+        data_map = result.data_map

         # Decrypt to new file
         output_path = Path(temp_dir) / "output.dat"
-        decrypt_from_storage(data_map, str(output_path), get_chunk)
+        decrypt_from_storage(data_map, str(output_path), str(chunk_dir))

         # Verify
         assert input_path.read_bytes() == output_path.read_bytes()

-def test_data_map_shrinking():
-    # Create large data to ensure multiple chunks
-    data = b"x" * 10_000_000
-
-    # Encrypt
-    data_map, chunks = encrypt(data)
-
-    # Track stored chunks
-    stored_chunks = {}
-    def store_chunk(hash_hex: str, content: bytes) -> None:
-        stored_chunks[hash_hex] = content
-
-    # Shrink data map
-    shrunk_map, shrink_chunks = shrink_data_map(data_map, store_chunk)
-
-    # Verify child level is set
-    assert shrunk_map.child() is not None
-    assert shrunk_map.is_child()
-
-    # Collect all chunks
-    all_chunks = chunks + shrink_chunks
-
-    # Decrypt using all chunks
-    decrypted = decrypt(shrunk_map, all_chunks)
-    assert data == decrypted
-
-def test_comprehensive_encryption_decryption():
-    test_sizes = [
-        (2 * 1024 * 1024, "2MB"),
-        (5 * 1024 * 1024, "5MB"),
-        (10 * 1024 * 1024, "10MB"),
-    ]
-
-    for size, name in test_sizes:
-        print(f"\nTesting {name} file")
-        data = b"x" * size
-
-        # Test in-memory encryption/decryption
-        data_map1, chunks1 = encrypt(data)
-        decrypted1 = decrypt(data_map1, chunks1)
-        assert data == decrypted1
-        print(f"✓ In-memory encryption/decryption successful")
-
-        # Test file-based encryption/decryption
-        with tempfile.TemporaryDirectory() as temp_dir:
-            # Setup paths
-            input_path = Path(temp_dir) / "input.dat"
-            chunk_dir = Path(temp_dir) / "chunks"
-            output_path = Path(temp_dir) / "output.dat"
-
-            # Write test data
-            input_path.write_bytes(data)
-            chunk_dir.mkdir()
-
-            # Encrypt file
-            data_map2, chunk_names = encrypt_from_file(str(input_path), str(chunk_dir))
-
-            # Create chunk retrieval function
-            def get_chunk(hash_hex: str) -> bytes:
-                chunk_path = chunk_dir / hash_hex
-                return chunk_path.read_bytes()
-
-            # Decrypt file
-            decrypt_from_storage(data_map2, str(output_path), get_chunk)
-
-            # Verify
-            assert data == output_path.read_bytes()
-            print(f"✓ File-based encryption/decryption successful")
-
-            # Verify data maps
-            assert data_map1.len() == data_map2.len()
-            assert data_map1.child() == data_map2.child()
-            print(f"✓ Data maps match")
-
 def test_streaming_decryption():
     with tempfile.TemporaryDirectory() as temp_dir:
         # Create test file
@@ -140,51 +45,15 @@ def test_streaming_decryption():
         chunk_dir.mkdir()

         # Encrypt file
-        data_map, chunk_names = encrypt_from_file(str(input_path), str(chunk_dir))
-
-        # Create parallel chunk retrieval function
-        def get_chunks(hash_hexes: List[str]) -> List[bytes]:
-            return [
-                (chunk_dir / hash_hex).read_bytes()
-                for hash_hex in hash_hexes
-            ]
+        result = encrypt_from_file(str(input_path), str(chunk_dir))
+        data_map = result.data_map

         # Decrypt using streaming
         output_path = Path(temp_dir) / "output.dat"
-        streaming_decrypt_from_storage(data_map, str(output_path), get_chunks)
+        streaming_decrypt_from_storage(data_map, str(output_path), str(chunk_dir))

         # Verify
         assert input_path.read_bytes() == output_path.read_bytes()

-def test_verify_chunk():
-    # Create some test data and encrypt it
-    data = b"x" * 10_000_000
-    data_map, chunks = encrypt(data)
-
-    # Get the first chunk and its hash
-    chunk = chunks[0]
-    chunk_info = data_map.infos()[0]
-
-    # Use dst_hash from chunk info and content from chunk
-    chunk_content = chunk.content()
-
-    # Create XorName from the content
-    xor_name = XorName.from_content(chunk_content)
-
-    # Print debug info
-    print(f"Chunk hash (XorName): {''.join(format(b, '02x') for b in xor_name.as_bytes())}")
-    print(f"Content length: {len(chunk_content)}")
-
-    # Verify valid chunk
-    verified_chunk = verify_chunk(xor_name, chunk_content)
-    assert isinstance(verified_chunk, EncryptedChunk)
-    assert verified_chunk.content() == chunk_content
-
-    # Test with corrupted content
-    corrupted_content = bytearray(chunk_content)
-    corrupted_content[0] ^= 1  # Flip one bit
-    with pytest.raises(ValueError, match="Chunk content hash mismatch"):
-        verify_chunk(xor_name, bytes(corrupted_content))
-
 if __name__ == "__main__":
     pytest.main([__file__])
\ No newline at end of file
diff --git a/tests/test_cli.py b/tests/test_cli.py
new file mode 100644
index 000000000..4ba4df0bf
--- /dev/null
+++ b/tests/test_cli.py
@@ -0,0 +1,69 @@
+import pytest
+from click.testing import CliRunner
+from self_encryption.cli import cli
+import tempfile
+import os
+from pathlib import Path
+
+@pytest.fixture
+def runner():
+    return CliRunner()
+
+@pytest.fixture
+def test_data():
+    return b"Hello, World!" * 1000  # Make it large enough to encrypt
+
+def test_encrypt_decrypt_flow(runner):
+    with tempfile.TemporaryDirectory() as tmpdir:
+        input_file = Path(tmpdir) / "input.dat"
+        input_file.write_bytes(b"Hello, World!" * 100)  # Make it large enough
+        chunks_dir = Path(tmpdir) / "chunks"
+        chunks_dir.mkdir()
+        data_map_file = Path(tmpdir) / "data_map.json"
+
+        # Test encryption
+        result = runner.invoke(cli, ['encrypt-file', str(input_file), str(chunks_dir)])
+        print(f"Encryption error output: {result.output}")
+        assert result.exit_code == 0
+
+        # Save data map to file
+        data_map_file.write_text(result.output)
+
+        output_file = Path(tmpdir) / "output.dat"
+
+        # Test decryption
+        result = runner.invoke(cli, [
+            'decrypt-file',
+            str(data_map_file),
+            str(chunks_dir),
+            str(output_file)
+        ])
+        assert result.exit_code == 0
+
+        # Verify content
+        assert output_file.read_bytes() == b"Hello, World!" * 100
+
+def test_streaming_decrypt(runner, test_data):
+    with tempfile.TemporaryDirectory() as tmpdir:
+        input_file = Path(tmpdir) / "input.dat"
+        input_file.write_bytes(test_data)
+        chunks_dir = Path(tmpdir) / "chunks"
+        chunks_dir.mkdir()
+        data_map_file = Path(tmpdir) / "data_map.json"
+        output_file = Path(tmpdir) / "output.dat"
+
+        # Encrypt
+        result = runner.invoke(cli, ['encrypt-file', str(input_file), str(chunks_dir)])
+        assert result.exit_code == 0
+        data_map_file.write_text(result.output)
+
+        # Test streaming decryption
+        result = runner.invoke(cli, [
+            'decrypt-file',
+            '--streaming',
+            str(data_map_file),
+            str(chunks_dir),
+            str(output_file)
+        ])
+        assert result.exit_code == 0
+        assert output_file.read_bytes() == test_data
\ No newline at end of file
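The CLI tests above drive the encrypt/decrypt flow end to end through click; the equivalent direct API calls look like this (a sketch with illustrative paths, persisting the data map as JSON between the two steps the way the CLI does):

```python
from pathlib import Path
from self_encryption import DataMap, encrypt_from_file, streaming_decrypt_from_storage

result = encrypt_from_file("large_input.dat", "chunks")
Path("data_map.json").write_text(result.data_map.to_json())

# Later, possibly in another process:
data_map = DataMap.from_json(Path("data_map.json").read_text())
streaming_decrypt_from_storage(data_map, "restored.dat", "chunks")
```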