diff --git a/.github/workflows/miri.yml b/.github/workflows/miri.yml new file mode 100644 index 000000000000..bdceb20c3008 --- /dev/null +++ b/.github/workflows/miri.yml @@ -0,0 +1,46 @@ +name: Miri +on: + push: + pull_request: +concurrency: + group: ${{ github.repository }}-${{ github.ref }}-${{ github.head_ref }}-${{ github.workflow }} + # Only cancel in PR mode. In push mode, don't cancel so we don't see spurious test "failures", + # and we get coverage reports on Coveralls for every push. + cancel-in-progress: ${{ github.event_name == 'pull_request' }} + +jobs: + miri: + if: github.repository_owner == 'Qiskit' + name: Miri + runs-on: ubuntu-latest + env: + RUSTUP_TOOLCHAIN: nightly + + steps: + - uses: actions/checkout@v4 + + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@nightly + with: + components: miri + + - name: Prepare Miri + run: | + set -e + # Some of our dependencies aren't Miri-safe with their current release versions. These + # need overriding with known-good versions to run against until the Miri-safe versions are + # released and updated in our Cargo.lock. 
+ cat >>Cargo.toml <- + CARGO_BUILD_TARGET="aarch64-apple-darwin" + PYO3_CROSS_LIB_DIR="/Library/Frameworks/Python.framework/Versions/$(python -c 'import sys; print(str(sys.version_info[0])+"."+str(sys.version_info[1]))')/lib/python$(python -c 'import sys; print(str(sys.version_info[0])+"."+str(sys.version_info[1]))')" - uses: actions/upload-artifact@v4 with: path: ./wheelhouse/*.whl - name: wheels-${{ matrix.os }}-32 - build_wheels_macos_arm: - name: Build wheels on macOS arm + name: wheels-${{ matrix.os }}-arm + build_wheels_32bit: + name: Build wheels 32bit runs-on: ${{ matrix.os }} - environment: release strategy: fail-fast: false matrix: - os: [macos-11] + os: [ubuntu-latest, windows-latest] steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 @@ -73,25 +84,23 @@ jobs: with: python-version: '3.10' - uses: dtolnay/rust-toolchain@stable + with: + components: llvm-tools-preview - name: Build wheels uses: pypa/cibuildwheel@v2.17.0 env: - CIBW_BEFORE_ALL: rustup target add aarch64-apple-darwin - CIBW_ARCHS_MACOS: arm64 universal2 - CIBW_ENVIRONMENT: >- - CARGO_BUILD_TARGET="aarch64-apple-darwin" - PYO3_CROSS_LIB_DIR="/Library/Frameworks/Python.framework/Versions/$(python -c 'import sys; print(str(sys.version_info[0])+"."+str(sys.version_info[1]))')/lib/python$(python -c 'import sys; print(str(sys.version_info[0])+"."+str(sys.version_info[1]))')" + CIBW_SKIP: 'pp* cp36-* cp37-* *musllinux* *amd64 *x86_64' - uses: actions/upload-artifact@v4 with: path: ./wheelhouse/*.whl - name: wheels-${{ matrix.os }}-arm + name: wheels-${{ matrix.os }}-32 upload_shared_wheels: name: Upload shared build wheels runs-on: ubuntu-latest environment: release permissions: id-token: write - needs: ["build_wheels", "build_wheels_macos_arm", "build_wheels_32bit"] + needs: ["build_wheels", "build_wheels_32bit", "build_wheels_macos_arm_py38"] steps: - uses: actions/download-artifact@v4 with: diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 89b47cc9c919..1ea3dc9f60f3 100644 
--- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -556,6 +556,58 @@ you just need to update the reference images as follows: Note: If you have run `test/ipynb/mpl_tester.ipynb` locally it is possible some file metadata has changed, **please do not commit and push changes to this file unless they were intentional**. + +### Testing Rust components + +Rust-accelerated functions are generally tested from Python space, but in cases +where there are Rust-specific internal details to be tested, `#[test]` functions +can be included inline. Typically it's most convenient to place these in a +separate inline module that is only conditionally compiled in, such as + +```rust +#[cfg(test)] +mod tests { + #[test] + fn my_first_test() { + assert_eq!(2, 1 + 1); + } +} +``` + +To run the Rust-space tests, do + +```bash +cargo test --no-default-features +``` + +Our Rust-space components are configured such that setting the +``--no-default-features`` flag will compile the test runner, but not attempt to +build a linked CPython extension module, which would cause linker failures. + +### Unsafe code and Miri + +Any `unsafe` code added to the Rust logic should be exercised by Rust-space +tests, in addition to the more complete Python test suite. In CI, we run the +Rust test suite under [Miri](https://github.com/rust-lang/miri) as an +undefined-behavior sanitizer. + +Miri is currently only available on `nightly` Rust channels, so to run it +locally you will need to ensure you have that channel available, such as by +```bash +rustup install nightly --components miri +``` + +After this, you can run the Miri test suite with +```bash +MIRIFLAGS="" cargo +nightly miri test +``` + +For the current set of `MIRIFLAGS` used by Qiskit's CI, see the +[`miri.yml`](https://github.com/Qiskit/qiskit/blob/main/.github/workflows/miri.yml) +GitHub Action file. This same file may also include patches to dependencies to +make them compatible with Miri, which you would need to temporarily apply as +well.
+ ## Style and lint Qiskit uses three tools for verify code formatting and lint checking. The diff --git a/Cargo.lock b/Cargo.lock index 6859c622967d..dee434e870a4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -589,6 +589,15 @@ dependencies = [ "either", ] +[[package]] +name = "itertools" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +dependencies = [ + "either", +] + [[package]] name = "jod-thread" version = "0.1.2" @@ -1079,6 +1088,7 @@ dependencies = [ "faer-ext", "hashbrown 0.14.3", "indexmap 2.2.6", + "itertools 0.12.1", "ndarray", "num-bigint", "num-complex", diff --git a/crates/accelerate/Cargo.toml b/crates/accelerate/Cargo.toml index 05ca3f5b6395..a43fdc6ff506 100644 --- a/crates/accelerate/Cargo.toml +++ b/crates/accelerate/Cargo.toml @@ -21,6 +21,7 @@ num-complex = "0.4" num-bigint = "0.4" rustworkx-core = "0.14" faer = "0.18.2" +itertools = "0.12.1" qiskit-circuit.workspace = true [dependencies.smallvec] diff --git a/crates/accelerate/src/isometry.rs b/crates/accelerate/src/isometry.rs new file mode 100644 index 000000000000..8d0761666bb6 --- /dev/null +++ b/crates/accelerate/src/isometry.rs @@ -0,0 +1,367 @@ +// This code is part of Qiskit. +// +// (C) Copyright IBM 2024 +// +// This code is licensed under the Apache License, Version 2.0. You may +// obtain a copy of this license in the LICENSE.txt file in the root directory +// of this source tree or at http://www.apache.org/licenses/LICENSE-2.0. +// +// Any modifications or derivative works of this code must retain this +// copyright notice, and modified files need to carry a notice indicating +// that they have been altered from the originals. 
+ +use std::ops::BitAnd; + +use approx::abs_diff_eq; +use num_complex::{Complex64, ComplexFloat}; +use pyo3::prelude::*; +use pyo3::wrap_pyfunction; +use pyo3::Python; + +use hashbrown::HashSet; +use itertools::Itertools; +use ndarray::prelude::*; +use numpy::{IntoPyArray, PyReadonlyArray1, PyReadonlyArray2}; + +use crate::two_qubit_decompose::ONE_QUBIT_IDENTITY; + +/// Find special unitary matrix that maps [c0,c1] to [r,0] or [0,r] if basis_state=0 or +/// basis_state=1 respectively +#[pyfunction] +pub fn reverse_qubit_state( + py: Python, + state: [Complex64; 2], + basis_state: usize, + epsilon: f64, +) -> PyObject { + reverse_qubit_state_inner(&state, basis_state, epsilon) + .into_pyarray_bound(py) + .into() +} + +#[inline(always)] +fn l2_norm(vec: &[Complex64]) -> f64 { + vec.iter() + .fold(0., |acc, elem| acc + elem.norm_sqr()) + .sqrt() +} + +fn reverse_qubit_state_inner( + state: &[Complex64; 2], + basis_state: usize, + epsilon: f64, +) -> Array2 { + let r = l2_norm(state); + let r_inv = 1. / r; + if r < epsilon { + Array2::eye(2) + } else if basis_state == 0 { + array![ + [state[0].conj() * r_inv, state[1].conj() * r_inv], + [-state[1] * r_inv, state[0] * r_inv], + ] + } else { + array![ + [-state[1] * r_inv, state[0] * r_inv], + [state[0].conj() * r_inv, state[1].conj() * r_inv], + ] + } +} + +/// This method finds the single-qubit gates for a UCGate to disentangle a qubit: +/// we consider the n-qubit state v[:,0] starting with k zeros (in the computational basis). +/// The qubit with label n-s-1 is disentangled into the basis state k_s(k,s). 
+ +#[pyfunction] +pub fn find_squs_for_disentangling( + py: Python, + v: PyReadonlyArray2, + k: usize, + s: usize, + epsilon: f64, + n: usize, +) -> Vec { + let v = v.as_array(); + let k_prime = 0; + let i_start = if b(k, s + 1) == 0 { + a(k, s + 1) + } else { + a(k, s + 1) + 1 + }; + let mut output: Vec> = (0..i_start).map(|_| Array2::eye(2)).collect(); + let mut squs: Vec> = (i_start..2_usize.pow((n - s - 1) as u32)) + .map(|i| { + reverse_qubit_state_inner( + &[ + v[[2 * i * 2_usize.pow(s as u32) + b(k, s), k_prime]], + v[[(2 * i + 1) * 2_usize.pow(s as u32) + b(k, s), k_prime]], + ], + k_s(k, s), + epsilon, + ) + }) + .collect(); + output.append(&mut squs); + output + .into_iter() + .map(|x| x.into_pyarray_bound(py).into()) + .collect() +} + +#[pyfunction] +pub fn apply_ucg( + py: Python, + m: PyReadonlyArray2, + k: usize, + single_qubit_gates: Vec>, +) -> PyObject { + let mut m = m.as_array().to_owned(); + let shape = m.shape(); + let num_qubits = shape[0].ilog2(); + let num_col = shape[1]; + let spacing: usize = 2_usize.pow(num_qubits - k as u32 - 1); + for j in 0..2_usize.pow(num_qubits - 1) { + let i = (j / spacing) * spacing + j; + let gate_index = i / (2_usize.pow(num_qubits - k as u32)); + for col in 0..num_col { + let gate = single_qubit_gates[gate_index].as_array(); + let a = m[[i, col]]; + let b = m[[i + spacing, col]]; + m[[i, col]] = gate[[0, 0]] * a + gate[[0, 1]] * b; + m[[i + spacing, col]] = gate[[1, 0]] * a + gate[[1, 1]] * b; + } + } + m.into_pyarray_bound(py).into() +} + +#[inline(always)] +fn bin_to_int(bin: &[u8]) -> usize { + bin.iter() + .fold(0_usize, |acc, digit| (acc << 1) + *digit as usize) +} + +#[pyfunction] +pub fn apply_diagonal_gate( + py: Python, + m: PyReadonlyArray2, + action_qubit_labels: Vec, + diag: PyReadonlyArray1, +) -> PyResult { + let diag = diag.as_slice()?; + let mut m = m.as_array().to_owned(); + let shape = m.shape(); + let num_qubits = shape[0].ilog2(); + let num_col = shape[1]; + for state in 
std::iter::repeat([0_u8, 1_u8]) + .take(num_qubits as usize) + .multi_cartesian_product() + { + let diag_index = action_qubit_labels + .iter() + .fold(0_usize, |acc, i| (acc << 1) + state[*i] as usize); + let i = bin_to_int(&state); + for j in 0..num_col { + m[[i, j]] = diag[diag_index] * m[[i, j]] + } + } + Ok(m.into_pyarray_bound(py).into()) +} + +#[pyfunction] +pub fn apply_diagonal_gate_to_diag( + mut m_diagonal: Vec, + action_qubit_labels: Vec, + diag: PyReadonlyArray1, + num_qubits: usize, +) -> PyResult> { + let diag = diag.as_slice()?; + if m_diagonal.is_empty() { + return Ok(m_diagonal); + } + for state in std::iter::repeat([0_u8, 1_u8]) + .take(num_qubits) + .multi_cartesian_product() + .take(m_diagonal.len()) + { + let diag_index = action_qubit_labels + .iter() + .fold(0_usize, |acc, i| (acc << 1) + state[*i] as usize); + let i = bin_to_int(&state); + m_diagonal[i] *= diag[diag_index] + } + Ok(m_diagonal) +} + +/// Helper for apply_multi_controlled_gate. Returns the basis states [e1, e2] (controls set, target +/// bit 0 and 1 respectively) for the bits state_free of the qubits we neither control nor act on +fn construct_basis_states( + state_free: &[u8], + control_set: &HashSet, + target_label: usize, +) -> [usize; 2] { + let size = state_free.len() + control_set.len() + 1; + let mut e1: usize = 0; + let mut e2: usize = 0; + let mut j = 0; + for i in 0..size { + e1 <<= 1; + e2 <<= 1; + if control_set.contains(&i) { + e1 += 1; + e2 += 1; + } else if i == target_label { + e2 += 1; + } else { + assert!(j < state_free.len()); + e1 += state_free[j] as usize; + e2 += state_free[j] as usize; + j += 1 + } + } + [e1, e2] +} + +#[pyfunction] +pub fn apply_multi_controlled_gate( + py: Python, + m: PyReadonlyArray2, + control_labels: Vec, + target_label: usize, + gate: PyReadonlyArray2, +) -> PyObject { + let mut m = m.as_array().to_owned(); + let gate = gate.as_array(); + let shape = m.shape(); + let num_qubits = shape[0].ilog2(); + let num_col = shape[1]; + let free_qubits = num_qubits
as usize - control_labels.len() - 1; + let control_set: HashSet = control_labels.into_iter().collect(); + if free_qubits == 0 { + let [e1, e2] = construct_basis_states(&[], &control_set, target_label); + for i in 0..num_col { + let temp: Vec<_> = gate + .dot(&aview2(&[[m[[e1, i]]], [m[[e2, i]]]])) + .into_iter() + .take(2) + .collect(); + m[[e1, i]] = temp[0]; + m[[e2, i]] = temp[1]; + } + return m.into_pyarray_bound(py).into(); + } + for state_free in std::iter::repeat([0_u8, 1_u8]) + .take(free_qubits) + .multi_cartesian_product() + { + let [e1, e2] = construct_basis_states(&state_free, &control_set, target_label); + for i in 0..num_col { + let temp: Vec<_> = gate + .dot(&aview2(&[[m[[e1, i]]], [m[[e2, i]]]])) + .into_iter() + .take(2) + .collect(); + m[[e1, i]] = temp[0]; + m[[e2, i]] = temp[1]; + } + } + m.into_pyarray_bound(py).into() +} + +#[pyfunction] +pub fn ucg_is_identity_up_to_global_phase( + single_qubit_gates: Vec>, + epsilon: f64, +) -> bool { + let global_phase: Complex64 = if single_qubit_gates[0].as_array()[[0, 0]].abs() >= epsilon { + single_qubit_gates[0].as_array()[[0, 0]].finv() + } else { + return false; + }; + for raw_gate in single_qubit_gates { + let gate = raw_gate.as_array(); + if !abs_diff_eq!( + gate.mapv(|x| x * global_phase), + aview2(&ONE_QUBIT_IDENTITY), + epsilon = 1e-8 // Default tolerance from numpy for allclose() + ) { + return false; + } + } + true +} + +#[pyfunction] +fn diag_is_identity_up_to_global_phase(diag: Vec, epsilon: f64) -> bool { + let global_phase: Complex64 = if diag[0].abs() >= epsilon { + diag[0].finv() + } else { + return false; + }; + for d in diag { + if (global_phase * d - 1.0).abs() >= epsilon { + return false; + } + } + true +} + +#[pyfunction] +pub fn merge_ucgate_and_diag( + py: Python, + single_qubit_gates: Vec>, + diag: Vec, +) -> Vec { + single_qubit_gates + .iter() + .enumerate() + .map(|(i, raw_gate)| { + let gate = raw_gate.as_array(); + let res = aview2(&[ + [diag[2 * i], Complex64::new(0., 0.)], 
+ [Complex64::new(0., 0.), diag[2 * i + 1]], + ]) + .dot(&gate); + res.into_pyarray_bound(py).into() + }) + .collect() +} + +#[inline(always)] +#[pyfunction] +fn k_s(k: usize, s: usize) -> usize { + if k == 0 { + 0 + } else { + let filter = 1 << s; + k.bitand(filter) >> s + } +} + +#[inline(always)] +#[pyfunction] +fn a(k: usize, s: usize) -> usize { + k / 2_usize.pow(s as u32) +} + +#[inline(always)] +#[pyfunction] +fn b(k: usize, s: usize) -> usize { + k - (a(k, s) * 2_usize.pow(s as u32)) +} + +#[pymodule] +pub fn isometry(m: &Bound) -> PyResult<()> { + m.add_wrapped(wrap_pyfunction!(diag_is_identity_up_to_global_phase))?; + m.add_wrapped(wrap_pyfunction!(find_squs_for_disentangling))?; + m.add_wrapped(wrap_pyfunction!(reverse_qubit_state))?; + m.add_wrapped(wrap_pyfunction!(apply_ucg))?; + m.add_wrapped(wrap_pyfunction!(apply_diagonal_gate))?; + m.add_wrapped(wrap_pyfunction!(apply_diagonal_gate_to_diag))?; + m.add_wrapped(wrap_pyfunction!(apply_multi_controlled_gate))?; + m.add_wrapped(wrap_pyfunction!(ucg_is_identity_up_to_global_phase))?; + m.add_wrapped(wrap_pyfunction!(merge_ucgate_and_diag))?; + m.add_wrapped(wrap_pyfunction!(a))?; + m.add_wrapped(wrap_pyfunction!(b))?; + m.add_wrapped(wrap_pyfunction!(k_s))?; + Ok(()) +} diff --git a/crates/accelerate/src/lib.rs b/crates/accelerate/src/lib.rs index 6ae8ab096383..68f21698555e 100644 --- a/crates/accelerate/src/lib.rs +++ b/crates/accelerate/src/lib.rs @@ -19,6 +19,7 @@ pub mod dense_layout; pub mod edge_collections; pub mod error_map; pub mod euler_one_qubit_decomposer; +pub mod isometry; pub mod nlayout; pub mod optimize_1q_gates; pub mod pauli_exp_val; @@ -29,9 +30,14 @@ pub mod sparse_pauli_op; pub mod stochastic_swap; pub mod target; pub mod two_qubit_decompose; +pub mod uc_gate; pub mod utils; pub mod vf2_layout; +mod rayon_ext; +#[cfg(test)] +mod test; + #[inline] pub fn getenv_use_multiple_threads() -> bool { let parallel_context = env::var("QISKIT_IN_PARALLEL") diff --git 
a/crates/accelerate/src/nlayout.rs b/crates/accelerate/src/nlayout.rs index 53c0a468e822..1a0b73b25fed 100644 --- a/crates/accelerate/src/nlayout.rs +++ b/crates/accelerate/src/nlayout.rs @@ -91,7 +91,7 @@ impl VirtualQubit { /// physical qubit index on the coupling graph. /// logical_qubits (int): The number of logical qubits in the layout /// physical_qubits (int): The number of physical qubits in the layout -#[pyclass(module = "qiskit._accelerate.stochastic_swap")] +#[pyclass(module = "qiskit._accelerate.nlayout")] #[derive(Clone, Debug)] pub struct NLayout { virt_to_phys: Vec, @@ -117,13 +117,13 @@ impl NLayout { res } - fn __getstate__(&self) -> (Vec, Vec) { - (self.virt_to_phys.clone(), self.phys_to_virt.clone()) - } - - fn __setstate__(&mut self, state: (Vec, Vec)) { - self.virt_to_phys = state.0; - self.phys_to_virt = state.1; + fn __reduce__(&self, py: Python) -> PyResult> { + Ok(( + py.get_type_bound::() + .getattr("from_virtual_to_physical")?, + (self.virt_to_phys.to_object(py),), + ) + .into_py(py)) } /// Return the layout mapping. diff --git a/crates/accelerate/src/rayon_ext.rs b/crates/accelerate/src/rayon_ext.rs new file mode 100644 index 000000000000..af914a86d414 --- /dev/null +++ b/crates/accelerate/src/rayon_ext.rs @@ -0,0 +1,171 @@ +// This code is part of Qiskit. +// +// (C) Copyright IBM 2023 +// +// This code is licensed under the Apache License, Version 2.0. You may +// obtain a copy of this license in the LICENSE.txt file in the root directory +// of this source tree or at http://www.apache.org/licenses/LICENSE-2.0. +// +// Any modifications or derivative works of this code must retain this +// copyright notice, and modified files need to carry a notice indicating +// that they have been altered from the originals. + +//! Extension structs for use with Rayon parallelism. + +// See https://github.com/rayon-rs/rayon/blob/v1.10.0/src/iter/plumbing/README.md (or a newer +// version) for more of an explanation of how Rayon's plumbing works. 
+ +use rayon::iter::plumbing::*; +use rayon::prelude::*; + +pub trait ParallelSliceMutExt: ParallelSliceMut { + /// Create a parallel iterator over mutable chunks of uneven lengths for this iterator. + /// + /// # Panics + /// + /// Panics if the sums of the given lengths do not add up to the length of the slice. + #[track_caller] + fn par_uneven_chunks_mut<'len, 'data>( + &'data mut self, + chunk_lengths: &'len [usize], + ) -> ParUnevenChunksMut<'len, 'data, T> { + let mut_slice = self.as_parallel_slice_mut(); + let chunk_sum = chunk_lengths.iter().sum::(); + let slice_len = mut_slice.len(); + if chunk_sum != slice_len { + panic!("given slices of total size {chunk_sum} for a chunk of length {slice_len}"); + } + ParUnevenChunksMut { + chunk_lengths, + data: mut_slice, + } + } +} + +impl ParallelSliceMutExt for S where S: ParallelSliceMut {} + +/// Very similar to Rayon's [rayon::slice::ChunksMut], except that the lengths of the individual +/// chunks are arbitrary, provided they sum to the total length of the slice. 
+#[derive(Debug)] +pub struct ParUnevenChunksMut<'len, 'data, T> { + chunk_lengths: &'len [usize], + data: &'data mut [T], +} + +impl<'len, 'data, T: Send + 'data> ParallelIterator for ParUnevenChunksMut<'len, 'data, T> { + type Item = &'data mut [T]; + + #[track_caller] + fn drive_unindexed>(self, consumer: C) -> C::Result { + bridge(self, consumer) + } +} + +impl<'len, 'data, T: Send + 'data> IndexedParallelIterator for ParUnevenChunksMut<'len, 'data, T> { + #[track_caller] + fn drive>(self, consumer: C) -> C::Result { + bridge(self, consumer) + } + + fn len(&self) -> usize { + self.chunk_lengths.len() + } + + #[track_caller] + fn with_producer>(self, callback: CB) -> CB::Output { + callback.callback(UnevenChunksMutProducer { + chunk_lengths: self.chunk_lengths, + data: self.data, + }) + } +} + +struct UnevenChunksMutProducer<'len, 'data, T: Send> { + chunk_lengths: &'len [usize], + data: &'data mut [T], +} + +impl<'len, 'data, T: Send + 'data> Producer for UnevenChunksMutProducer<'len, 'data, T> { + type Item = &'data mut [T]; + type IntoIter = UnevenChunksMutIter<'len, 'data, T>; + + fn into_iter(self) -> Self::IntoIter { + Self::IntoIter::new(self.chunk_lengths, self.data) + } + + #[track_caller] + fn split_at(self, index: usize) -> (Self, Self) { + // Technically quadratic for a full-depth split, but let's worry about that later if needed. + let data_mid = self.chunk_lengths[..index].iter().sum(); + let (chunks_left, chunks_right) = self.chunk_lengths.split_at(index); + let (data_left, data_right) = self.data.split_at_mut(data_mid); + ( + Self { + chunk_lengths: chunks_left, + data: data_left, + }, + Self { + chunk_lengths: chunks_right, + data: data_right, + }, + ) + } +} + +#[must_use = "iterators do nothing unless consumed"] +struct UnevenChunksMutIter<'len, 'data, T> { + chunk_lengths: &'len [usize], + // The extra `Option` wrapper here is to satisfy the borrow checker while we're splitting the + // `data` reference. 
We need to consume `self`'s reference during the split before replacing + // it, which means we need to temporarily set the `data` ref to some unowned value. + // `Option<&mut [T]>` means we can replace it temporarily with the null reference, ensuring the + // mutable aliasing rules are always upheld. + data: Option<&'data mut [T]>, +} + +impl<'len, 'data, T> UnevenChunksMutIter<'len, 'data, T> { + fn new(chunk_lengths: &'len [usize], data: &'data mut [T]) -> Self { + Self { + chunk_lengths, + data: Some(data), + } + } +} + +impl<'len, 'data, T> Iterator for UnevenChunksMutIter<'len, 'data, T> { + type Item = &'data mut [T]; + + #[track_caller] + fn next(&mut self) -> Option { + if self.chunk_lengths.is_empty() { + return None; + } + let (out_data, rem_data) = self + .data + .take() + .unwrap() + .split_at_mut(self.chunk_lengths[0]); + self.chunk_lengths = &self.chunk_lengths[1..]; + self.data = Some(rem_data); + Some(out_data) + } + + fn size_hint(&self) -> (usize, Option) { + (self.chunk_lengths.len(), Some(self.chunk_lengths.len())) + } +} +impl<'len, 'data, T> ExactSizeIterator for UnevenChunksMutIter<'len, 'data, T> {} +impl<'len, 'data, T> DoubleEndedIterator for UnevenChunksMutIter<'len, 'data, T> { + #[track_caller] + fn next_back(&mut self) -> Option { + if self.chunk_lengths.is_empty() { + return None; + } + let pos = self.chunk_lengths.len() - 1; + let data_pos = self.data.as_ref().map(|x| x.len()).unwrap() - self.chunk_lengths[pos]; + let (rem_data, out_data) = self.data.take().unwrap().split_at_mut(data_pos); + self.chunk_lengths = &self.chunk_lengths[..pos]; + self.data = Some(rem_data); + Some(out_data) + } +} diff --git a/crates/accelerate/src/sparse_pauli_op.rs b/crates/accelerate/src/sparse_pauli_op.rs index c8e0f0fb9316..5d6a82df7940 100644 --- a/crates/accelerate/src/sparse_pauli_op.rs +++ b/crates/accelerate/src/sparse_pauli_op.rs @@ -10,16 +10,22 @@ // copyright notice, and modified files need to carry a notice indicating // that they have 
been altered from the originals. -use pyo3::exceptions::PyValueError; +use pyo3::exceptions::{PyRuntimeError, PyValueError}; use pyo3::prelude::*; +use pyo3::types::PyTuple; use pyo3::wrap_pyfunction; use pyo3::Python; +use numpy::prelude::*; +use numpy::{PyArray1, PyArray2, PyReadonlyArray1, PyReadonlyArray2, PyUntypedArrayMethods}; + use hashbrown::HashMap; use ndarray::{s, Array1, Array2, ArrayView1, ArrayView2, Axis}; use num_complex::Complex64; use num_traits::Zero; -use numpy::{IntoPyArray, PyArray1, PyArray2, PyReadonlyArray2, PyUntypedArrayMethods}; +use rayon::prelude::*; + +use crate::rayon_ext::*; /// Find the unique elements of an array. /// @@ -71,9 +77,49 @@ enum Pauli { Z, } +/// Pack a 2D array of Booleans into a given width. Returns an error if the input array is +/// too large to be packed into u64. +fn pack_bits(bool_arr: ArrayView2) -> Result, ()> { + let num_qubits = bool_arr.shape()[1]; + if num_qubits > (u64::BITS as usize) { + return Err(()); + } + let slack = num_qubits % 8; + let pack_row = |row: ArrayView1| -> u64 { + let mut val: u64 = 0; + let mut shift = 0; + for chunk in row.exact_chunks(8) { + val |= ((chunk[0] as u8 + | ((chunk[1] as u8) << 1) + | ((chunk[2] as u8) << 2) + | ((chunk[3] as u8) << 3) + | ((chunk[4] as u8) << 4) + | ((chunk[5] as u8) << 5) + | ((chunk[6] as u8) << 6) + | ((chunk[7] as u8) << 7)) as u64) + << shift; + shift += 8; + } + if slack > 0 { + for (i, b) in row + .slice(s![num_qubits - slack..num_qubits]) + .iter() + .enumerate() + { + val |= (*b as u64) << (shift + i); + } + } + val + }; + Ok(bool_arr + .axis_iter(Axis(0)) + .map(pack_row) + .collect::>()) +} + /// A complete ZX-convention representation of a Pauli decomposition. This is all the components /// necessary to construct a Qiskit-space :class:`.SparsePauliOp`, where :attr:`phases` is in the -/// ZX convention. +/// ZX convention. This class is just meant for interoperation between Rust and Python. 
#[pyclass(module = "qiskit._accelerate.sparse_pauli_op")] pub struct ZXPaulis { #[pyo3(get)] @@ -86,6 +132,196 @@ pub struct ZXPaulis { pub coeffs: Py>, } +#[pymethods] +impl ZXPaulis { + #[new] + fn __new__( + x: &Bound>, + z: &Bound>, + phases: &Bound>, + coeffs: &Bound>, + ) -> PyResult { + let &[num_ops, num_qubits] = x.shape() else { unreachable!("PyArray2 must be 2D") }; + if z.shape() != [num_ops, num_qubits] { + return Err(PyValueError::new_err(format!( + "'x' and 'z' have different shapes: {:?} and {:?}", + [num_ops, num_qubits], + z.shape() + ))); + } + if phases.len() != num_ops || coeffs.len() != num_ops { + return Err(PyValueError::new_err(format!( + "mismatched dimensions: 'x' and 'z' have {} operator(s), 'phase' has {} and 'coeffs' has {}", + num_ops, + phases.len(), + coeffs.len(), + ))); + } + + Ok(Self { + x: x.to_owned().unbind(), + z: z.to_owned().unbind(), + phases: phases.to_owned().unbind(), + coeffs: coeffs.to_owned().unbind(), + }) + } +} + +impl ZXPaulis { + /// Attempt to acquire a Rust-enforced Rust-only immutable borrow onto the underlying + /// Python-space data. This returns `None` if any of the underlying arrays already has a + /// mutable borrow taken out onto it. + pub fn try_readonly<'a, 'py>(&'a self, py: Python<'py>) -> Option> + where + 'a: 'py, + { + Some(ZXPaulisReadonly { + x: self.x.bind(py).try_readonly().ok()?, + z: self.z.bind(py).try_readonly().ok()?, + phases: self.phases.bind(py).try_readonly().ok()?, + coeffs: self.coeffs.bind(py).try_readonly().ok()?, + }) + } +} + +/// Intermediate structure that represents readonly views onto the Python-space sparse Pauli data. +/// This is used in the chained methods so that the syntactical temporary lifetime extension can +/// occur; we can't have the readonly array temporaries only live within a method that returns +/// [ZXPaulisView], because otherwise the lifetimes of the [PyReadonlyArray] elements will be too +/// short. 
+pub struct ZXPaulisReadonly<'a> { + x: PyReadonlyArray2<'a, bool>, + z: PyReadonlyArray2<'a, bool>, + phases: PyReadonlyArray1<'a, u8>, + coeffs: PyReadonlyArray1<'a, Complex64>, +} + +impl ZXPaulisReadonly<'_> { + /// Get a [ndarray] view of the data of these [rust-numpy] objects. + fn as_array(&self) -> ZXPaulisView { + ZXPaulisView { + x: self.x.as_array(), + z: self.z.as_array(), + phases: self.phases.as_array(), + coeffs: self.coeffs.as_array(), + } + } +} + +/// Intermediate structure that represents [ndarray] views onto the Python-space sparse Pauli data +/// in the ZX convention. This can be used directly by Rust methods if desired, or bit-packed into +/// a matrix-representation format [MatrixCompressedPaulis] using the [compress] method. +pub struct ZXPaulisView<'py> { + x: ArrayView2<'py, bool>, + z: ArrayView2<'py, bool>, + phases: ArrayView1<'py, u8>, + coeffs: ArrayView1<'py, Complex64>, +} + +impl<'py> ZXPaulisView<'py> { + /// The number of qubits this operator acts on. + pub fn num_qubits(&self) -> usize { + self.x.shape()[1] + } + + /// Convert the ZX representation into a bitpacked internal representation. See the + /// documentation of [MatrixCompressedPaulis] for details of the changes to the Pauli + /// convention and representation. + pub fn matrix_compress(&self) -> PyResult { + let num_qubits = self.num_qubits(); + // This is obviously way too big for a dense operator, and SciPy limits us to using `i64` + // for the index and indptr types, so (except for some synthetic cases), it's not possible + // for us to work with a larger matrix than this. 
+ if num_qubits > 63 { + return Err(PyValueError::new_err(format!( + "{num_qubits} is too many qubits to convert to a matrix" + ))); + } + if num_qubits == 0 { + return Ok(MatrixCompressedPaulis { + num_qubits: 0, + x_like: Vec::new(), + z_like: Vec::new(), + coeffs: self.coeffs.to_vec(), + }); + } + let x_like = pack_bits(self.x).expect("x should already be validated as <64 qubits"); + let z_like = pack_bits(self.z).expect("z should already be validated as <64 qubits"); + let coeffs = x_like + .iter() + .zip(z_like.iter()) + .zip(self.phases.iter().zip(self.coeffs.iter())) + .map(|((xs, zs), (&phase, &coeff))| { + let ys = (xs & zs).count_ones(); + match (phase as u32 + ys) % 4 { + 0 => coeff, + 1 => Complex64::new(coeff.im, -coeff.re), + 2 => Complex64::new(-coeff.re, -coeff.im), + 3 => Complex64::new(-coeff.im, coeff.re), + _ => unreachable!(), + } + }) + .collect::>(); + Ok(MatrixCompressedPaulis { + num_qubits: num_qubits as u8, + x_like, + z_like, + coeffs, + }) + } +} + +/// Temporary bit-compressed storage of the Pauli string. The [coeffs] are reinterpreted to +/// include the old `phase` component in them directly, plus the factors of `-i` stemming from `Y` +/// components. The result is that the [coeffs] now more directly represent entries in a matrix, +/// while [x_like] and [z_like] are no longer direct measures of `X` and `Z` elements (as in the ZX +/// convention), but are instead only a marker of the column and parity respectively. +/// +/// In other words, `row_num ^ x_like` gives the column number of an element, while +/// `(row_num & z_like).count_ones()` counts multiplicative factors of `-1` to be applied to +/// `coeff` when placing it at `(row_num, col_num)` in an output matrix. +pub struct MatrixCompressedPaulis { + num_qubits: u8, + x_like: Vec, + z_like: Vec, + coeffs: Vec, +} + +impl MatrixCompressedPaulis { + /// The number of qubits this operator acts on. 
+ pub fn num_qubits(&self) -> usize { + self.num_qubits as usize + } + + /// The number of explicitly stored operators in the sum. + pub fn num_ops(&self) -> usize { + self.coeffs.len() + } + + /// Sum coefficients that correspond to the same Pauli operator; this reduces the number of + /// explicitly stored operations, if there are duplicates. After the summation, any terms that + /// have become zero are dropped. + pub fn combine(&mut self) { + let mut hash_table = HashMap::<(u64, u64), Complex64>::with_capacity(self.coeffs.len()); + for (key, coeff) in self + .x_like + .drain(..) + .zip(self.z_like.drain(..)) + .zip(self.coeffs.drain(..)) + { + *hash_table.entry(key).or_insert(Complex64::new(0.0, 0.0)) += coeff; + } + for ((x, z), coeff) in hash_table { + if coeff == Complex64::new(0.0, 0.0) { + continue; + } + self.x_like.push(x); + self.z_like.push(z); + self.coeffs.push(coeff); + } + } +} + /// Decompose a dense complex operator into the symplectic Pauli representation in the /// ZX-convention. /// @@ -257,10 +493,385 @@ fn decompose_dense_inner( ); } +/// Convert the given [ZXPaulis] object to a dense 2D Numpy matrix. +#[pyfunction] +#[pyo3(signature = (/, paulis, force_serial=false))] +pub fn to_matrix_dense<'py>( + py: Python<'py>, + paulis: &ZXPaulis, + force_serial: bool, +) -> PyResult>> { + let paulis_readonly = paulis + .try_readonly(py) + .ok_or_else(|| PyRuntimeError::new_err("could not produce a safe view onto the data"))?; + let mut paulis = paulis_readonly.as_array().matrix_compress()?; + paulis.combine(); + let side = 1usize << paulis.num_qubits(); + let parallel = !force_serial && crate::getenv_use_multiple_threads(); + let out = to_matrix_dense_inner(&paulis, parallel); + PyArray1::from_vec_bound(py, out).reshape([side, side]) +} + +/// Inner worker of the Python-exposed [to_matrix_dense]. This is separate primarily to allow +/// Rust-space unit testing even if Python isn't available for execution. 
This returns a C-ordered +/// [Vec] of the 2D matrix. +fn to_matrix_dense_inner(paulis: &MatrixCompressedPaulis, parallel: bool) -> Vec { + let side = 1usize << paulis.num_qubits(); + #[allow(clippy::uninit_vec)] + let mut out = { + let mut out = Vec::with_capacity(side * side); + // SAFETY: we iterate through the vec in chunks of `side`, and start each row by filling it + // with zeros before ever reading from it. It's fine to overwrite the uninitialised memory + // because `Complex64: !Drop`. + unsafe { out.set_len(side * side) }; + out + }; + let write_row = |(i_row, row): (usize, &mut [Complex64])| { + // Doing the initialisation here means that when we're in parallel contexts, we do the + // zeroing across the whole threadpool. This also seems to give a speed-up in serial + // contexts, but I don't understand that. ---Jake + row.fill(Complex64::new(0.0, 0.0)); + for ((&x_like, &z_like), &coeff) in paulis + .x_like + .iter() + .zip(paulis.z_like.iter()) + .zip(paulis.coeffs.iter()) + { + // Technically this discards part of the storable data, but in practice, a dense + // operator with more than 32 qubits needs in the region of 1 ZiB memory. We still use + // `u64` to help sparse-matrix construction, though. + let coeff = if (i_row as u32 & z_like as u32).count_ones() % 2 == 0 { + coeff + } else { + -coeff + }; + row[i_row ^ (x_like as usize)] += coeff; + } + }; + if parallel { + out.par_chunks_mut(side).enumerate().for_each(write_row); + } else { + out.chunks_mut(side).enumerate().for_each(write_row); + } + out +} + +type CSRData = (Vec, Vec, Vec); +type ToCSRData = fn(&MatrixCompressedPaulis) -> CSRData; + +/// Convert the given [ZXPaulis] object to the three-array CSR form. The output type of the +/// `indices` and `indptr` matrices will be `i32` if that type is guaranteed to be able to hold the +/// number of non-zeros, otherwise it will be `i64`. Signed types are used to match Scipy. 
`i32` +/// is preferentially returned, because Scipy will downcast to this on `csr_matrix` construction if +/// all array elements would fit. For large operators with significant cancellation, it is +/// possible that `i64` will be returned when `i32` would suffice, but this will not cause +/// unsoundness, just a copy overhead when constructing the Scipy matrix. +#[pyfunction] +#[pyo3(signature = (/, paulis, force_serial=false))] +pub fn to_matrix_sparse( + py: Python, + paulis: &ZXPaulis, + force_serial: bool, +) -> PyResult> { + let paulis_readonly = paulis + .try_readonly(py) + .ok_or_else(|| PyRuntimeError::new_err("could not produce a safe view onto the data"))?; + let mut paulis = paulis_readonly.as_array().matrix_compress()?; + paulis.combine(); + + // This deliberately erases the Rust types in the output so we can return either 32- or 64-bit + // indices as appropriate without breaking Rust's typing. + fn to_py_tuple(py: Python, csr_data: CSRData) -> Py + where + T: numpy::Element, + { + let (values, indices, indptr) = csr_data; + ( + PyArray1::from_vec_bound(py, values), + PyArray1::from_vec_bound(py, indices), + PyArray1::from_vec_bound(py, indptr), + ) + .into_py(py) + } + + // Pessimistic estimation of whether we can fit in `i32`. If there's any risk of overflowing + // `i32`, we use `i64`, but Scipy will always try to downcast to `i32`, so we try to match it. 
+ let max_entries_per_row = (paulis.num_ops() as u64).min(1u64 << (paulis.num_qubits() - 1)); + let use_32_bit = + max_entries_per_row.saturating_mul(1u64 << paulis.num_qubits()) <= (i32::MAX as u64); + if use_32_bit { + let to_sparse: ToCSRData = if crate::getenv_use_multiple_threads() && !force_serial { + to_matrix_sparse_parallel_32 + } else { + to_matrix_sparse_serial_32 + }; + Ok(to_py_tuple(py, to_sparse(&paulis))) + } else { + let to_sparse: ToCSRData = if crate::getenv_use_multiple_threads() && !force_serial { + to_matrix_sparse_parallel_64 + } else { + to_matrix_sparse_serial_64 + }; + Ok(to_py_tuple(py, to_sparse(&paulis))) + } +} + +/// Copy several slices into a single flat vec, in parallel. Allocates a temporary `Vec` of +/// the same length as the input slice to track the chunking. +fn copy_flat_parallel(slices: &[U]) -> Vec +where + T: Copy + Send + Sync, + U: AsRef<[T]> + Sync, +{ + let lens = slices + .iter() + .map(|slice| slice.as_ref().len()) + .collect::>(); + let size = lens.iter().sum(); + #[allow(clippy::uninit_vec)] + let mut out = { + let mut out = Vec::with_capacity(size); + // SAFETY: we've just calculated that the lengths of the given vecs add up to the right + // thing, and we're about to copy in the data from each of them into this uninitialised + // array. It's guaranteed safe to write `T` to the uninitialised space, because `Copy` + // implies `!Drop`. + unsafe { out.set_len(size) }; + out + }; + out.par_uneven_chunks_mut(&lens) + .zip(slices.par_iter().map(|x| x.as_ref())) + .for_each(|(out_slice, in_slice)| out_slice.copy_from_slice(in_slice)); + out +} + +macro_rules! impl_to_matrix_sparse { + ($serial_fn:ident, $parallel_fn:ident, $int_ty:ty, $uint_ty:ty $(,)?) => { + /// Build CSR data arrays for the matrix-compressed set of the Pauli operators, using a + /// completely serial strategy. 
+ fn $serial_fn(paulis: &MatrixCompressedPaulis) -> CSRData<$int_ty> { + let side = 1 << paulis.num_qubits(); + let num_ops = paulis.num_ops(); + if num_ops == 0 { + return (vec![], vec![], vec![0; side + 1]); + } + + let mut order = (0..num_ops).collect::>(); + let mut values = Vec::::with_capacity(side * (num_ops + 1) / 2); + let mut indices = Vec::<$int_ty>::with_capacity(side * (num_ops + 1) / 2); + let mut indptr: Vec<$int_ty> = vec![0; side + 1]; + let mut nnz = 0; + for i_row in 0..side { + order.sort_unstable_by(|&a, &b| { + ((i_row as $uint_ty) ^ (paulis.x_like[a] as $uint_ty)) + .cmp(&((i_row as $uint_ty) ^ (paulis.x_like[b] as $uint_ty))) + }); + let mut running = Complex64::new(0.0, 0.0); + let mut prev_index = i_row ^ (paulis.x_like[order[0]] as usize); + for (x_like, z_like, coeff) in order + .iter() + .map(|&i| (paulis.x_like[i], paulis.z_like[i], paulis.coeffs[i])) + { + let coeff = + if ((i_row as $uint_ty) & (z_like as $uint_ty)).count_ones() % 2 == 0 { + coeff + } else { + -coeff + }; + let index = i_row ^ (x_like as usize); + if index == prev_index { + running += coeff; + } else { + nnz += 1; + values.push(running); + indices.push(prev_index as $int_ty); + running = coeff; + prev_index = index; + } + } + nnz += 1; + values.push(running); + indices.push(prev_index as $int_ty); + indptr[i_row + 1] = nnz; + } + (values, indices, indptr) + } + + /// Build CSR data arrays for the matrix-compressed set of the Pauli operators, using a + /// parallel strategy. This involves more data copying than the serial form, so there is a + /// nontrivial amount of parallel overhead. 
+ fn $parallel_fn(paulis: &MatrixCompressedPaulis) -> CSRData<$int_ty> { + let side = 1 << paulis.num_qubits(); + let num_ops = paulis.num_ops(); + if num_ops == 0 { + return (vec![], vec![], vec![0; side + 1]); + } + + let mut indptr = Vec::<$int_ty>::with_capacity(side + 1); + indptr.push(0); + // SAFETY: we allocate the space for the `indptr` array here, then each thread writes + // in the number of nonzero entries for each row it was responsible for. We know ahead + // of time exactly how many entries we need (one per row, plus an explicit 0 to start). + // It's also important that `$int_ty` does not implement `Drop`, since otherwise it + // will be called on uninitialised memory (all primitive int types satisfy this). + unsafe { + indptr.set_len(side + 1); + } + + // The parallel overhead from splitting a subtask is fairly high (allocating and + // potentially growing a couple of vecs), so we're trading off some of Rayon's ability + // to keep threads busy by subdivision with minimising overhead; we're setting the + // chunk size such that the iterator will have as many elements as there are threads. + let num_threads = rayon::current_num_threads(); + let chunk_size = (side + num_threads - 1) / num_threads; + let mut values_chunks = Vec::with_capacity(num_threads); + let mut indices_chunks = Vec::with_capacity(num_threads); + // SAFETY: the slice here is uninitialised data; it must not be read. + indptr[1..] + .par_chunks_mut(chunk_size) + .enumerate() + .map(|(i, indptr_chunk)| { + let start = chunk_size * i; + let end = (chunk_size * (i + 1)).min(side); + let mut order = (0..num_ops).collect::>(); + // Since we compressed the Paulis by summing equal elements, we're + // lower-bounded on the number of elements per row by this value, up to + // cancellations. This should be a reasonable trade-off between sometimes + // expandin the vector and overallocation. 
+ let mut values = + Vec::::with_capacity(chunk_size * (num_ops + 1) / 2); + let mut indices = Vec::<$int_ty>::with_capacity(chunk_size * (num_ops + 1) / 2); + let mut nnz = 0; + for i_row in start..end { + order.sort_unstable_by(|&a, &b| { + (i_row as $uint_ty ^ paulis.x_like[a] as $uint_ty) + .cmp(&(i_row as $uint_ty ^ paulis.x_like[b] as $uint_ty)) + }); + let mut running = Complex64::new(0.0, 0.0); + let mut prev_index = i_row ^ (paulis.x_like[order[0]] as usize); + for (x_like, z_like, coeff) in order + .iter() + .map(|&i| (paulis.x_like[i], paulis.z_like[i], paulis.coeffs[i])) + { + let coeff = + if (i_row as $uint_ty & z_like as $uint_ty).count_ones() % 2 == 0 { + coeff + } else { + -coeff + }; + let index = i_row ^ (x_like as usize); + if index == prev_index { + running += coeff; + } else { + nnz += 1; + values.push(running); + indices.push(prev_index as $int_ty); + running = coeff; + prev_index = index; + } + } + nnz += 1; + values.push(running); + indices.push(prev_index as $int_ty); + // When we write it, this is a cumulative `nnz` _within the chunk_. We + // turn that into a proper cumulative sum in serial afterwards. + indptr_chunk[i_row - start] = nnz; + } + (values, indices) + }) + .unzip_into_vecs(&mut values_chunks, &mut indices_chunks); + // Turn the chunkwise nnz counts into absolute nnz counts. + let mut start_nnz = 0usize; + let chunk_nnz = values_chunks + .iter() + .map(|chunk| { + let prev = start_nnz; + start_nnz += chunk.len(); + prev as $int_ty + }) + .collect::>(); + indptr[1..] + .par_chunks_mut(chunk_size) + .zip(chunk_nnz) + .for_each(|(indptr_chunk, start_nnz)| { + indptr_chunk.iter_mut().for_each(|nnz| *nnz += start_nnz); + }); + // Concatenate the chunkwise values and indices togther. 
+ let values = copy_flat_parallel(&values_chunks); + let indices = copy_flat_parallel(&indices_chunks); + (values, indices, indptr) + } + }; +} + +impl_to_matrix_sparse!( + to_matrix_sparse_serial_32, + to_matrix_sparse_parallel_32, + i32, + u32 +); +impl_to_matrix_sparse!( + to_matrix_sparse_serial_64, + to_matrix_sparse_parallel_64, + i64, + u64 +); + #[pymodule] pub fn sparse_pauli_op(m: &Bound) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(unordered_unique))?; m.add_wrapped(wrap_pyfunction!(decompose_dense))?; + m.add_wrapped(wrap_pyfunction!(to_matrix_dense))?; + m.add_wrapped(wrap_pyfunction!(to_matrix_sparse))?; m.add_class::()?; Ok(()) } + +#[cfg(test)] +mod tests { + use super::*; + use crate::test::*; + + // The purpose of these tests is more about exercising the `unsafe` code; we test for full + // correctness from Python space. + + fn example_paulis() -> MatrixCompressedPaulis { + MatrixCompressedPaulis { + num_qubits: 4, + x_like: vec![0b0000, 0b0001, 0b0010, 0b1100, 0b1010, 0b0000], + z_like: vec![0b1000, 0b0110, 0b1001, 0b0100, 0b1010, 0b1000], + // Deliberately using multiples of small powers of two so the floating-point addition + // of them is associative. 
+ coeffs: vec![ + Complex64::new(0.25, 0.5), + Complex64::new(0.125, 0.25), + Complex64::new(0.375, 0.125), + Complex64::new(-0.375, 0.0625), + Complex64::new(-0.5, -0.25), + ], + } + } + + #[test] + fn dense_threaded_and_serial_equal() { + let paulis = example_paulis(); + let parallel = in_scoped_thread_pool(|| to_matrix_dense_inner(&paulis, true)).unwrap(); + let serial = to_matrix_dense_inner(&paulis, false); + assert_eq!(parallel, serial); + } + + #[test] + fn sparse_threaded_and_serial_equal_32() { + let paulis = example_paulis(); + let parallel = in_scoped_thread_pool(|| to_matrix_sparse_parallel_32(&paulis)).unwrap(); + let serial = to_matrix_sparse_serial_32(&paulis); + assert_eq!(parallel, serial); + } + + #[test] + fn sparse_threaded_and_serial_equal_64() { + let paulis = example_paulis(); + let parallel = in_scoped_thread_pool(|| to_matrix_sparse_parallel_64(&paulis)).unwrap(); + let serial = to_matrix_sparse_serial_64(&paulis); + assert_eq!(parallel, serial); + } +} diff --git a/crates/accelerate/src/test.rs b/crates/accelerate/src/test.rs new file mode 100644 index 000000000000..dac51499202b --- /dev/null +++ b/crates/accelerate/src/test.rs @@ -0,0 +1,24 @@ +// This code is part of Qiskit. +// +// (C) Copyright IBM 2023 +// +// This code is licensed under the Apache License, Version 2.0. You may +// obtain a copy of this license in the LICENSE.txt file in the root directory +// of this source tree or at http://www.apache.org/licenses/LICENSE-2.0. +// +// Any modifications or derivative works of this code must retain this +// copyright notice, and modified files need to carry a notice indicating +// that they have been altered from the originals. + +/// Helper for tests that involve calling Rayon code from within Miri. This runs the given +/// function in a scoped threadpool, which is then immediately dropped. 
This means that Miri will +/// not complain about the global (static) threads that are not joined when the process exits, +/// which is deliberate. +pub fn in_scoped_thread_pool<T, F>(worker: F) -> Result<T, ::rayon::ThreadPoolBuildError> +where + T: Send, + F: FnOnce() -> T + Send, +{ + ::rayon::ThreadPoolBuilder::new() + .build_scoped(::rayon::ThreadBuilder::run, |pool| pool.install(worker)) +} diff --git a/crates/accelerate/src/two_qubit_decompose.rs b/crates/accelerate/src/two_qubit_decompose.rs index 7dcb273ac163..5e833bd86fda 100644 --- a/crates/accelerate/src/two_qubit_decompose.rs +++ b/crates/accelerate/src/two_qubit_decompose.rs @@ -60,7 +60,7 @@ const TWO_PI: f64 = 2.0 * PI; const C1: c64 = c64 { re: 1.0, im: 0.0 }; -static ONE_QUBIT_IDENTITY: [[Complex64; 2]; 2] = [ +pub static ONE_QUBIT_IDENTITY: [[Complex64; 2]; 2] = [ [Complex64::new(1., 0.), Complex64::new(0., 0.)], [Complex64::new(0., 0.), Complex64::new(1., 0.)], ]; diff --git a/crates/accelerate/src/uc_gate.rs b/crates/accelerate/src/uc_gate.rs new file mode 100644 index 000000000000..3a5f74a6f0b1 --- /dev/null +++ b/crates/accelerate/src/uc_gate.rs @@ -0,0 +1,163 @@ +// This code is part of Qiskit. +// +// (C) Copyright IBM 2024 +// +// This code is licensed under the Apache License, Version 2.0. You may +// obtain a copy of this license in the LICENSE.txt file in the root directory +// of this source tree or at http://www.apache.org/licenses/LICENSE-2.0. +// +// Any modifications or derivative works of this code must retain this +// copyright notice, and modified files need to carry a notice indicating +// that they have been altered from the originals. 
+ +use num_complex::{Complex64, ComplexFloat}; +use pyo3::prelude::*; +use pyo3::wrap_pyfunction; +use pyo3::Python; +use std::f64::consts::{FRAC_1_SQRT_2, PI}; + +use faer_ext::{IntoFaerComplex, IntoNdarrayComplex}; +use ndarray::prelude::*; +use numpy::{IntoPyArray, PyReadonlyArray2}; + +use crate::euler_one_qubit_decomposer::det_one_qubit; + +const PI2: f64 = PI / 2.; +const EPS: f64 = 1e-10; + +// These constants are the non-zero elements of an RZ gate's unitary with an +// angle of pi / 2 +const RZ_PI2_11: Complex64 = Complex64::new(FRAC_1_SQRT_2, -FRAC_1_SQRT_2); +const RZ_PI2_00: Complex64 = Complex64::new(FRAC_1_SQRT_2, FRAC_1_SQRT_2); + +/// This method implements the decomposition given in equation (3) in +/// https://arxiv.org/pdf/quant-ph/0410066.pdf. +/// +/// The decomposition is used recursively to decompose uniformly controlled gates. +/// +/// a,b = single qubit unitaries +/// v,u,r = outcome of the decomposition given in the reference mentioned above +/// +/// (see there for the details). +fn demultiplex_single_uc( + a: ArrayView2, + b: ArrayView2, +) -> [Array2; 3] { + let x = a.dot(&b.mapv(|x| x.conj()).t()); + let det_x = det_one_qubit(x.view()); + let x11 = x[[0, 0]] / det_x.sqrt(); + let phi = det_x.arg(); + + let r1 = (Complex64::new(0., 1.) / 2. * (PI2 - phi / 2. - x11.arg())).exp(); + let r2 = (Complex64::new(0., 1.) / 2. * (PI2 - phi / 2. + x11.arg() + PI)).exp(); + + let r = array![[r1, Complex64::new(0., 0.)], [Complex64::new(0., 0.), r2],]; + + let decomp = r + .dot(&x) + .dot(&r) + .view() + .into_faer_complex() + .complex_eigendecomposition(); + let mut u: Array2 = decomp.u().into_ndarray_complex().to_owned(); + let s = decomp.s().column_vector(); + let mut diag: Array1 = + Array1::from_shape_fn(u.shape()[0], |i| s[i].to_num_complex()); + + // If d is not equal to diag(i,-i), then we put it into this "standard" form + // (see eq. 
(13) in https://arxiv.org/pdf/quant-ph/0410066.pdf) by interchanging + // the eigenvalues and eigenvectors + if (diag[0] + Complex64::new(0., 1.)).abs() < EPS { + diag = diag.slice(s![..;-1]).to_owned(); + u = u.slice(s![.., ..;-1]).to_owned(); + } + diag.mapv_inplace(|x| x.sqrt()); + let d = Array2::from_diag(&diag); + let v = d + .dot(&u.mapv(|x| x.conj()).t()) + .dot(&r.mapv(|x| x.conj()).t()) + .dot(&b); + [v, u, r] +} + +#[pyfunction] +pub fn dec_ucg_help( + py: Python, + sq_gates: Vec>, + num_qubits: u32, +) -> (Vec, PyObject) { + let mut single_qubit_gates: Vec> = sq_gates + .into_iter() + .map(|x| x.as_array().to_owned()) + .collect(); + let mut diag: Array1 = Array1::ones(2_usize.pow(num_qubits)); + let num_controls = num_qubits - 1; + for dec_step in 0..num_controls { + let num_ucgs = 2_usize.pow(dec_step); + // The decomposition works recursively and the followign loop goes over the different + // UCGates that arise in the decomposition + for ucg_index in 0..num_ucgs { + let len_ucg = 2_usize.pow(num_controls - dec_step); + for i in 0..len_ucg / 2 { + let shift = ucg_index * len_ucg; + let a = single_qubit_gates[shift + i].view(); + let b = single_qubit_gates[shift + len_ucg / 2 + i].view(); + // Apply the decomposition for UCGates given in equation (3) in + // https://arxiv.org/pdf/quant-ph/0410066.pdf + // to demultiplex one control of all the num_ucgs uniformly-controlled gates + // with log2(len_ucg) uniform controls + let [v, u, r] = demultiplex_single_uc(a, b); + // replace the single-qubit gates with v,u (the already existing ones + // are not needed any more) + single_qubit_gates[shift + i] = v; + single_qubit_gates[shift + len_ucg / 2 + i] = u; + // Now we decompose the gates D as described in Figure 4 in + // https://arxiv.org/pdf/quant-ph/0410066.pdf and merge some of the gates + // into the UCGates and the diagonal at the end of the circuit + + // Remark: The Rz(pi/2) rotation acting on the target qubit and the Hadamard + // gates arising in 
the decomposition of D are ignored for the moment (they will + // be added together with the C-NOT gates at the end of the decomposition + // (in the method dec_ucg())) + let r_conj_t = r.mapv(|x| x.conj()).t().to_owned(); + if ucg_index < num_ucgs - 1 { + // Absorb the Rz(pi/2) rotation on the control into the UC-Rz gate and + // merge the UC-Rz rotation with the following UCGate, + // which hasn't been decomposed yet + let k = shift + len_ucg + i; + + single_qubit_gates[k] = single_qubit_gates[k].dot(&r_conj_t); + single_qubit_gates[k].mapv_inplace(|x| x * RZ_PI2_00); + let k = k + len_ucg / 2; + single_qubit_gates[k] = single_qubit_gates[k].dot(&r); + single_qubit_gates[k].mapv_inplace(|x| x * RZ_PI2_11); + } else { + // Absorb the Rz(pi/2) rotation on the control into the UC-Rz gate and merge + // the trailing UC-Rz rotation into a diagonal gate at the end of the circuit + for ucg_index_2 in 0..num_ucgs { + let shift_2 = ucg_index_2 * len_ucg; + let k = 2 * (i + shift_2); + diag[k] *= r_conj_t[[0, 0]] * RZ_PI2_00; + diag[k + 1] *= r_conj_t[[1, 1]] * RZ_PI2_00; + let k = len_ucg + k; + diag[k] *= r[[0, 0]] * RZ_PI2_11; + diag[k + 1] *= r[[1, 1]] * RZ_PI2_11; + } + } + } + } + } + ( + single_qubit_gates + .into_iter() + .map(|x| x.into_pyarray_bound(py).into()) + .collect(), + diag.into_pyarray_bound(py).into(), + ) +} + +#[pymodule] +pub fn uc_gate(m: &Bound) -> PyResult<()> { + m.add_wrapped(wrap_pyfunction!(dec_ucg_help))?; + Ok(()) +} diff --git a/crates/circuit/src/circuit_data.rs b/crates/circuit/src/circuit_data.rs index 07bab2c17c9b..590fc07e8f8b 100644 --- a/crates/circuit/src/circuit_data.rs +++ b/crates/circuit/src/circuit_data.rs @@ -459,21 +459,14 @@ impl CircuitData { clbits: Option<&Bound>, ) -> PyResult<()> { let mut temp = CircuitData::new(py, qubits, clbits, None, 0)?; - if temp.qubits_native.len() < self.qubits_native.len() { - return Err(PyValueError::new_err(format!( - "Replacement 'qubits' of size {:?} must contain at least {:?} bits.", - 
temp.qubits_native.len(), - self.qubits_native.len(), - ))); - } - if temp.clbits_native.len() < self.clbits_native.len() { - return Err(PyValueError::new_err(format!( - "Replacement 'clbits' of size {:?} must contain at least {:?} bits.", - temp.clbits_native.len(), - self.clbits_native.len(), - ))); - } if qubits.is_some() { + if temp.qubits_native.len() < self.qubits_native.len() { + return Err(PyValueError::new_err(format!( + "Replacement 'qubits' of size {:?} must contain at least {:?} bits.", + temp.qubits_native.len(), + self.qubits_native.len(), + ))); + } std::mem::swap(&mut temp.qubits, &mut self.qubits); std::mem::swap(&mut temp.qubits_native, &mut self.qubits_native); std::mem::swap( @@ -482,6 +475,13 @@ impl CircuitData { ); } if clbits.is_some() { + if temp.clbits_native.len() < self.clbits_native.len() { + return Err(PyValueError::new_err(format!( + "Replacement 'clbits' of size {:?} must contain at least {:?} bits.", + temp.clbits_native.len(), + self.clbits_native.len(), + ))); + } std::mem::swap(&mut temp.clbits, &mut self.clbits); std::mem::swap(&mut temp.clbits_native, &mut self.clbits_native); std::mem::swap( diff --git a/crates/pyext/src/lib.rs b/crates/pyext/src/lib.rs index e4ec9ac7326d..0cbf39262f87 100644 --- a/crates/pyext/src/lib.rs +++ b/crates/pyext/src/lib.rs @@ -15,11 +15,12 @@ use pyo3::wrap_pymodule; use qiskit_accelerate::{ convert_2q_block_matrix::convert_2q_block_matrix, dense_layout::dense_layout, - error_map::error_map, euler_one_qubit_decomposer::euler_one_qubit_decomposer, nlayout::nlayout, - optimize_1q_gates::optimize_1q_gates, pauli_exp_val::pauli_expval, results::results, - sabre::sabre, sampled_exp_val::sampled_exp_val, sparse_pauli_op::sparse_pauli_op, - stochastic_swap::stochastic_swap, target::target, two_qubit_decompose::two_qubit_decompose, - utils::utils, vf2_layout::vf2_layout, + error_map::error_map, euler_one_qubit_decomposer::euler_one_qubit_decomposer, + isometry::isometry, nlayout::nlayout, 
optimize_1q_gates::optimize_1q_gates, + pauli_exp_val::pauli_expval, results::results, sabre::sabre, sampled_exp_val::sampled_exp_val, + sparse_pauli_op::sparse_pauli_op, stochastic_swap::stochastic_swap, target::target, + two_qubit_decompose::two_qubit_decompose, uc_gate::uc_gate, utils::utils, + vf2_layout::vf2_layout, }; #[pymodule] @@ -31,6 +32,7 @@ fn _accelerate(m: &Bound) -> PyResult<()> { m.add_wrapped(wrap_pymodule!(dense_layout))?; m.add_wrapped(wrap_pymodule!(error_map))?; m.add_wrapped(wrap_pymodule!(euler_one_qubit_decomposer))?; + m.add_wrapped(wrap_pymodule!(isometry))?; m.add_wrapped(wrap_pymodule!(nlayout))?; m.add_wrapped(wrap_pymodule!(optimize_1q_gates))?; m.add_wrapped(wrap_pymodule!(pauli_expval))?; @@ -41,6 +43,7 @@ fn _accelerate(m: &Bound) -> PyResult<()> { m.add_wrapped(wrap_pymodule!(stochastic_swap))?; m.add_wrapped(wrap_pymodule!(target))?; m.add_wrapped(wrap_pymodule!(two_qubit_decompose))?; + m.add_wrapped(wrap_pymodule!(uc_gate))?; m.add_wrapped(wrap_pymodule!(utils))?; m.add_wrapped(wrap_pymodule!(vf2_layout))?; Ok(()) diff --git a/pyproject.toml b/pyproject.toml index 0d99f9256df3..97ccde21d1b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -136,7 +136,7 @@ target-version = ['py38', 'py39', 'py310', 'py311'] [tool.cibuildwheel] manylinux-x86_64-image = "manylinux2014" manylinux-i686-image = "manylinux2014" -skip = "pp* cp36-* cp37-* *musllinux* *win32 *i686" +skip = "pp* cp36-* cp37-* *musllinux* *win32 *i686 cp38-macosx_arm64" test-skip = "*win32 *linux_i686" test-command = "python {project}/examples/python/stochastic_swap.py" # We need to use pre-built versions of Numpy and Scipy in the tests; they have a @@ -212,6 +212,7 @@ disable = [ "no-else-return", # relax "elif" after a clause with a return "docstring-first-line-empty", # relax docstring style "import-outside-toplevel", "import-error", # overzealous with our optionals/dynamic packages + "nested-min-max", # this gives false equivalencies if implemented for the current 
lint version # TODO(#9614): these were added in modern Pylint. Decide if we want to enable them. If so, # remove from here and fix the issues. Else, move it above this section and add a comment # with the rationale @@ -221,19 +222,15 @@ disable = [ "consider-using-dict-items", "consider-using-enumerate", "consider-using-f-string", - "modified-iterating-list", - "nested-min-max", "no-member", "no-value-for-parameter", "not-context-manager", - "superfluous-parens", "unexpected-keyword-arg", "unnecessary-dict-index-lookup", "unnecessary-dunder-call", "unnecessary-lambda-assignment", "unspecified-encoding", "unsupported-assignment-operation", - "use-dict-literal", "use-implicit-booleaness-not-comparison", ] diff --git a/qiskit/__init__.py b/qiskit/__init__.py index 355d63e89418..991d4eaafa26 100644 --- a/qiskit/__init__.py +++ b/qiskit/__init__.py @@ -53,6 +53,7 @@ import qiskit._accelerate +import qiskit._numpy_compat # Globally define compiled submodules. The normal import mechanism will not find compiled submodules # in _accelerate because it relies on file paths, but PyO3 generates only one shared library file. @@ -64,6 +65,8 @@ ) sys.modules["qiskit._accelerate.dense_layout"] = qiskit._accelerate.dense_layout sys.modules["qiskit._accelerate.error_map"] = qiskit._accelerate.error_map +sys.modules["qiskit._accelerate.isometry"] = qiskit._accelerate.isometry +sys.modules["qiskit._accelerate.uc_gate"] = qiskit._accelerate.uc_gate sys.modules["qiskit._accelerate.euler_one_qubit_decomposer"] = ( qiskit._accelerate.euler_one_qubit_decomposer ) diff --git a/qiskit/_numpy_compat.py b/qiskit/_numpy_compat.py new file mode 100644 index 000000000000..a6c06671c986 --- /dev/null +++ b/qiskit/_numpy_compat.py @@ -0,0 +1,73 @@ +# This code is part of Qiskit. +# +# (C) Copyright IBM 2024. +# +# This code is licensed under the Apache License, Version 2.0. 
You may +# obtain a copy of this license in the LICENSE.txt file in the root directory +# of this source tree or at http://www.apache.org/licenses/LICENSE-2.0. +# +# Any modifications or derivative works of this code must retain this +# copyright notice, and modified files need to carry a notice indicating +# that they have been altered from the originals. + +"""Compatibility helpers for the Numpy 1.x to 2.0 transition.""" + +import re +import typing +import warnings + +import numpy as np + +# This version pattern is taken from the pypa packaging project: +# https://github.com/pypa/packaging/blob/21.3/packaging/version.py#L223-L254 which is dual licensed +# Apache 2.0 and BSD see the source for the original authors and other details. +_VERSION_PATTERN = r""" + v? + (?: + (?:(?P<epoch>[0-9]+)!)? # epoch + (?P<release>[0-9]+(?:\.[0-9]+)*) # release segment + (?P<pre>                                          # pre-release
+            [-_\.]?
+            (?P<pre_l>(a|b|c|rc|alpha|beta|pre|preview))
+            [-_\.]?
+            (?P<pre_n>[0-9]+)?
+        )?
+        (?P<post>                                         # post release
+            (?:-(?P<post_n1>[0-9]+))
+            |
+            (?:
+                [-_\.]?
+                (?P<post_l>post|rev|r)
+                [-_\.]?
+                (?P<post_n2>[0-9]+)?
+            )
+        )?
+        (?P<dev>                                          # dev release
+            [-_\.]?
+            (?P<dev_l>dev)
+            [-_\.]?
+            (?P<dev_n>[0-9]+)?
+        )?
+    )
+    (?:\+(?P<local>[a-z0-9]+(?:[-_\.][a-z0-9]+)*))?       # local version
+"""
+
+VERSION = np.lib.NumpyVersion(np.__version__)
+VERSION_PARTS: typing.Tuple[int, ...]
+"""The numeric parts of the Numpy release version, e.g. ``(2, 0, 0)``.  Does not include pre- or
+post-release markers (e.g. ``rc1``)."""
+if match := re.fullmatch(_VERSION_PATTERN, np.__version__, flags=re.VERBOSE | re.IGNORECASE):
+    # Assuming Numpy won't ever introduce epochs, and we don't care about pre/post markers.
+    VERSION_PARTS = tuple(int(x) for x in match["release"].split("."))
+else:
+    # Just guess a version.  We know all existing Numpys have good version strings, so the only way
+    # this should trigger is from a new or a dev version.
+    warnings.warn(
+        f"Unrecognized version string for Numpy: '{np.__version__}'.  Assuming Numpy 2.0.",
+        RuntimeWarning,
+    )
+    VERSION_PARTS = (2, 0, 0)
+
+COPY_ONLY_IF_NEEDED = None if VERSION_PARTS >= (2, 0, 0) else False
+"""The sentinel value given to ``np.array`` and ``np.ndarray.astype`` (etc) to indicate that a copy
+should be made only if required."""
diff --git a/qiskit/assembler/disassemble.py b/qiskit/assembler/disassemble.py
index e777f4050dda..c94b108c4b25 100644
--- a/qiskit/assembler/disassemble.py
+++ b/qiskit/assembler/disassemble.py
@@ -231,13 +231,12 @@ def _experiments_to_circuits(qobj):
         pulse_lib = qobj.config.pulse_library if hasattr(qobj.config, "pulse_library") else []
         # The dict update method did not work here; could investigate in the future
         if hasattr(qobj.config, "calibrations"):
-            circuit.calibrations = dict(
-                **circuit.calibrations, **_qobj_to_circuit_cals(qobj, pulse_lib)
-            )
+            circuit.calibrations = {
+                **circuit.calibrations,
+                **_qobj_to_circuit_cals(qobj, pulse_lib),
+            }
         if hasattr(exp.config, "calibrations"):
-            circuit.calibrations = dict(
-                **circuit.calibrations, **_qobj_to_circuit_cals(exp, pulse_lib)
-            )
+            circuit.calibrations = {**circuit.calibrations, **_qobj_to_circuit_cals(exp, pulse_lib)}
         circuits.append(circuit)
     return circuits
 
diff --git a/qiskit/circuit/__init__.py b/qiskit/circuit/__init__.py
index 2e8603af9f7d..9fbefb4c5d9f 100644
--- a/qiskit/circuit/__init__.py
+++ b/qiskit/circuit/__init__.py
@@ -833,7 +833,7 @@
 ``__array__``.  This is used by :meth:`Gate.to_matrix`, and has the signature:
 
 .. currentmodule:: None
-.. py:method:: __array__(dtype=None)
+.. py:method:: __array__(dtype=None, copy=None)
 
     Return a Numpy array representing the gate.  This can use the gate's :attr:`~Instruction.params`
     field, and may assume that these are numeric values (assuming the subclass expects that) and not
@@ -875,7 +875,9 @@ def power(self, exponent: float):
             # Also we have an efficient representation of power.
             return RXZGate(exponent * self.params[0])
 
-        def __array__(self, dtype=None):
+        def __array__(self, dtype=None, copy=None):
+            if copy is False:
+                raise ValueError("unable to avoid copy while creating an array as requested")
             cos = math.cos(0.5 * self.params[0])
             isin = 1j * math.sin(0.5 * self.params[0])
             return np.array([
@@ -1340,6 +1342,7 @@ def __array__(self, dtype=None):
 """
 
 from .exceptions import CircuitError
+from . import _utils
 from .quantumcircuit import QuantumCircuit
 from .classicalregister import ClassicalRegister, Clbit
 from .quantumregister import QuantumRegister, Qubit, AncillaRegister, AncillaQubit
diff --git a/qiskit/circuit/_classical_resource_map.py b/qiskit/circuit/_classical_resource_map.py
index cfbdd077bda4..454826d6035d 100644
--- a/qiskit/circuit/_classical_resource_map.py
+++ b/qiskit/circuit/_classical_resource_map.py
@@ -37,17 +37,20 @@ class VariableMapper(expr.ExprVisitor[expr.Expr]):
     ``ValueError`` will be raised instead.  The given ``add_register`` callable may choose to raise
     its own exception."""
 
-    __slots__ = ("target_cregs", "register_map", "bit_map", "add_register")
+    __slots__ = ("target_cregs", "register_map", "bit_map", "var_map", "add_register")
 
     def __init__(
         self,
         target_cregs: typing.Iterable[ClassicalRegister],
         bit_map: typing.Mapping[Bit, Bit],
+        var_map: typing.Mapping[expr.Var, expr.Var] | None = None,
+        *,
         add_register: typing.Callable[[ClassicalRegister], None] | None = None,
     ):
         self.target_cregs = tuple(target_cregs)
         self.register_map = {}
         self.bit_map = bit_map
+        self.var_map = var_map or {}
         self.add_register = add_register
 
     def _map_register(self, theirs: ClassicalRegister) -> ClassicalRegister:
@@ -127,9 +130,7 @@ def visit_var(self, node, /):
             return expr.Var(self.bit_map[node.var], node.type)
         if isinstance(node.var, ClassicalRegister):
             return expr.Var(self._map_register(node.var), node.type)
-        # Defensive against the expansion of the variable system; we don't want to silently do the
-        # wrong thing (which would be `return node` without mapping, right now).
-        raise RuntimeError(f"unhandled variable in 'compose': {node}")  # pragma: no cover
+        return self.var_map.get(node, node)
 
     def visit_value(self, node, /):
         return expr.Value(node.value, node.type)
diff --git a/qiskit/circuit/_utils.py b/qiskit/circuit/_utils.py
index cfde85bad8dd..86a058e88525 100644
--- a/qiskit/circuit/_utils.py
+++ b/qiskit/circuit/_utils.py
@@ -15,6 +15,8 @@
 
 import math
 import numpy
+
+from qiskit import _numpy_compat
 from qiskit.exceptions import QiskitError
 from qiskit.circuit.exceptions import CircuitError
 from .parametervector import ParameterVectorElement
@@ -117,8 +119,9 @@ def with_gate_array(base_array):
     nonwritable = numpy.array(base_array, dtype=numpy.complex128)
     nonwritable.setflags(write=False)
 
-    def __array__(_self, dtype=None):
-        return numpy.asarray(nonwritable, dtype=dtype)
+    def __array__(_self, dtype=None, copy=_numpy_compat.COPY_ONLY_IF_NEEDED):
+        dtype = nonwritable.dtype if dtype is None else dtype
+        return numpy.array(nonwritable, dtype=dtype, copy=copy)
 
     def decorator(cls):
         if hasattr(cls, "__array__"):
@@ -149,15 +152,21 @@ def matrix_for_control_state(state):
     if cached_states is None:
         nonwritables = [matrix_for_control_state(state) for state in range(2**num_ctrl_qubits)]
 
-        def __array__(self, dtype=None):
-            return numpy.asarray(nonwritables[self.ctrl_state], dtype=dtype)
+        def __array__(self, dtype=None, copy=_numpy_compat.COPY_ONLY_IF_NEEDED):
+            arr = nonwritables[self.ctrl_state]
+            dtype = arr.dtype if dtype is None else dtype
+            return numpy.array(arr, dtype=dtype, copy=copy)
 
     else:
         nonwritables = {state: matrix_for_control_state(state) for state in cached_states}
 
-        def __array__(self, dtype=None):
-            if (out := nonwritables.get(self.ctrl_state)) is not None:
-                return numpy.asarray(out, dtype=dtype)
+        def __array__(self, dtype=None, copy=_numpy_compat.COPY_ONLY_IF_NEEDED):
+            if (arr := nonwritables.get(self.ctrl_state)) is not None:
+                dtype = arr.dtype if dtype is None else dtype
+                return numpy.array(arr, dtype=dtype, copy=copy)
+
+            if copy is False and copy is not _numpy_compat.COPY_ONLY_IF_NEEDED:
+                raise ValueError("could not produce matrix without calculation")
             return numpy.asarray(
                 _compute_control_matrix(base, num_ctrl_qubits, self.ctrl_state), dtype=dtype
             )
diff --git a/qiskit/circuit/controlflow/control_flow.py b/qiskit/circuit/controlflow/control_flow.py
index 51b3709db6b5..2085f760ebcd 100644
--- a/qiskit/circuit/controlflow/control_flow.py
+++ b/qiskit/circuit/controlflow/control_flow.py
@@ -22,6 +22,7 @@
 
 if typing.TYPE_CHECKING:
     from qiskit.circuit import QuantumCircuit
+    from qiskit.circuit.classical import expr
 
 
 class ControlFlowOp(Instruction, ABC):
@@ -72,3 +73,12 @@ def map_block(block: QuantumCircuit) -> QuantumCircuit:
         Returns:
             New :class:`ControlFlowOp` with replaced blocks.
         """
+
+    def iter_captured_vars(self) -> typing.Iterable[expr.Var]:
+        """Get an iterator over the unique captured variables in all blocks of this construct."""
+        seen = set()
+        for block in self.blocks:
+            for var in block.iter_captured_vars():
+                if var not in seen:
+                    seen.add(var)
+                    yield var
diff --git a/qiskit/circuit/delay.py b/qiskit/circuit/delay.py
index 16d84d15cbe5..a333125a5a2b 100644
--- a/qiskit/circuit/delay.py
+++ b/qiskit/circuit/delay.py
@@ -17,9 +17,11 @@
 from qiskit.circuit.exceptions import CircuitError
 from qiskit.circuit.instruction import Instruction
 from qiskit.circuit.gate import Gate
+from qiskit.circuit import _utils
 from qiskit.circuit.parameterexpression import ParameterExpression
 
 
+@_utils.with_gate_array(np.eye(2, dtype=complex))
 class Delay(Instruction):
     """Do nothing and just delay/wait/idle for a specified duration."""
 
@@ -53,10 +55,6 @@ def duration(self, duration):
         """Set the duration of this delay."""
         self.params = [duration]
 
-    def __array__(self, dtype=None):
-        """Return the identity matrix."""
-        return np.array([[1, 0], [0, 1]], dtype=dtype)
-
     def to_matrix(self) -> np.ndarray:
         """Return a Numpy.array for the unitary matrix. This has been
         added to enable simulation without making delay a full Gate type.
diff --git a/qiskit/circuit/library/blueprintcircuit.py b/qiskit/circuit/library/blueprintcircuit.py
index 3d1f5c77f44f..2bbd5ca5650a 100644
--- a/qiskit/circuit/library/blueprintcircuit.py
+++ b/qiskit/circuit/library/blueprintcircuit.py
@@ -128,11 +128,31 @@ def _append(self, instruction, _qargs=None, _cargs=None):
         return super()._append(instruction, _qargs, _cargs)
 
     def compose(
-        self, other, qubits=None, clbits=None, front=False, inplace=False, wrap=False, *, copy=True
+        self,
+        other,
+        qubits=None,
+        clbits=None,
+        front=False,
+        inplace=False,
+        wrap=False,
+        *,
+        copy=True,
+        var_remap=None,
+        inline_captures=False,
     ):
         if not self._is_built:
             self._build()
-        return super().compose(other, qubits, clbits, front, inplace, wrap, copy=copy)
+        return super().compose(
+            other,
+            qubits,
+            clbits,
+            front,
+            inplace,
+            wrap,
+            copy=copy,
+            var_remap=var_remap,
+            inline_captures=inline_captures,
+        )
 
     def inverse(self, annotated: bool = False):
         if not self._is_built:
@@ -180,10 +200,10 @@ def num_connected_components(self, unitary_only=False):
             self._build()
         return super().num_connected_components(unitary_only=unitary_only)
 
-    def copy_empty_like(self, name=None):
+    def copy_empty_like(self, name=None, *, vars_mode="alike"):
         if not self._is_built:
             self._build()
-        cpy = super().copy_empty_like(name=name)
+        cpy = super().copy_empty_like(name=name, vars_mode=vars_mode)
         # The base `copy_empty_like` will typically trigger code that `BlueprintCircuit` treats as
         # an "invalidation", so we have to manually restore properties deleted by that that
         # `copy_empty_like` is supposed to propagate.
diff --git a/qiskit/circuit/library/generalized_gates/isometry.py b/qiskit/circuit/library/generalized_gates/isometry.py
index 1294feb26342..c180e7a14484 100644
--- a/qiskit/circuit/library/generalized_gates/isometry.py
+++ b/qiskit/circuit/library/generalized_gates/isometry.py
@@ -21,7 +21,6 @@
 
 from __future__ import annotations
 
-import itertools
 import math
 import numpy as np
 from qiskit.circuit.exceptions import CircuitError
@@ -30,6 +29,7 @@
 from qiskit.circuit.quantumregister import QuantumRegister
 from qiskit.exceptions import QiskitError
 from qiskit.quantum_info.operators.predicates import is_isometry
+from qiskit._accelerate import isometry as isometry_rs
 
 from .diagonal import Diagonal
 from .uc import UCGate
@@ -157,12 +157,16 @@ def _gates_to_uncompute(self):
         # correspond to the firstfew columns of the identity matrix up to diag, and hence we only
         # have to save a list containing them.
         for column_index in range(2**m):
-            self._decompose_column(circuit, q, diag, remaining_isometry, column_index)
+            remaining_isometry, diag = self._decompose_column(
+                circuit, q, diag, remaining_isometry, column_index
+            )
             # extract phase of the state that was sent to the basis state ket(column_index)
             diag.append(remaining_isometry[column_index, 0])
             # remove first column (which is now stored in diag)
             remaining_isometry = remaining_isometry[:, 1:]
-        if len(diag) > 1 and not _diag_is_identity_up_to_global_phase(diag, self._epsilon):
+        if len(diag) > 1 and not isometry_rs.diag_is_identity_up_to_global_phase(
+            diag, self._epsilon
+        ):
             diagonal = Diagonal(np.conj(diag))
             circuit.append(diagonal, q_input)
         return circuit
@@ -173,7 +177,10 @@ def _decompose_column(self, circuit, q, diag, remaining_isometry, column_index):
         """
         n = int(math.log2(self.iso_data.shape[0]))
         for s in range(n):
-            self._disentangle(circuit, q, diag, remaining_isometry, column_index, s)
+            remaining_isometry, diag = self._disentangle(
+                circuit, q, diag, remaining_isometry, column_index, s
+            )
+        return remaining_isometry, diag
 
     def _disentangle(self, circuit, q, diag, remaining_isometry, column_index, s):
         """
@@ -189,13 +196,19 @@ def _disentangle(self, circuit, q, diag, remaining_isometry, column_index, s):
         n = int(math.log2(self.iso_data.shape[0]))
 
         # MCG to set one entry to zero (preparation for disentangling with UCGate):
-        index1 = 2 * _a(k, s + 1) * 2**s + _b(k, s + 1)
-        index2 = (2 * _a(k, s + 1) + 1) * 2**s + _b(k, s + 1)
+        index1 = 2 * isometry_rs.a(k, s + 1) * 2**s + isometry_rs.b(k, s + 1)
+        index2 = (2 * isometry_rs.a(k, s + 1) + 1) * 2**s + isometry_rs.b(k, s + 1)
         target_label = n - s - 1
         # Check if a MCG is required
-        if _k_s(k, s) == 0 and _b(k, s + 1) != 0 and np.abs(v[index2, k_prime]) > self._epsilon:
+        if (
+            isometry_rs.k_s(k, s) == 0
+            and isometry_rs.b(k, s + 1) != 0
+            and np.abs(v[index2, k_prime]) > self._epsilon
+        ):
             # Find the MCG, decompose it and apply it to the remaining isometry
-            gate = _reverse_qubit_state([v[index1, k_prime], v[index2, k_prime]], 0, self._epsilon)
+            gate = isometry_rs.reverse_qubit_state(
+                [v[index1, k_prime], v[index2, k_prime]], 0, self._epsilon
+            )
             control_labels = [
                 i
                 for i, x in enumerate(_get_binary_rep_as_list(k, n))
@@ -205,57 +218,49 @@ def _disentangle(self, circuit, q, diag, remaining_isometry, column_index, s):
                 circuit, q, gate, control_labels, target_label
             )
             # apply the MCG to the remaining isometry
-            _apply_multi_controlled_gate(v, control_labels, target_label, gate)
+            v = isometry_rs.apply_multi_controlled_gate(v, control_labels, target_label, gate)
             # correct for the implementation "up to diagonal"
-            diag_mcg_inverse = np.conj(diagonal_mcg).tolist()
-            _apply_diagonal_gate(v, control_labels + [target_label], diag_mcg_inverse)
+            diag_mcg_inverse = np.conj(diagonal_mcg).astype(complex, copy=False)
+            v = isometry_rs.apply_diagonal_gate(
+                v, control_labels + [target_label], diag_mcg_inverse
+            )
             # update the diag according to the applied diagonal gate
-            _apply_diagonal_gate_to_diag(diag, control_labels + [target_label], diag_mcg_inverse, n)
+            diag = isometry_rs.apply_diagonal_gate_to_diag(
+                diag, control_labels + [target_label], diag_mcg_inverse, n
+            )
 
         # UCGate to disentangle a qubit:
         # Find the UCGate, decompose it and apply it to the remaining isometry
         single_qubit_gates = self._find_squs_for_disentangling(v, k, s)
-        if not _ucg_is_identity_up_to_global_phase(single_qubit_gates, self._epsilon):
+        if not isometry_rs.ucg_is_identity_up_to_global_phase(single_qubit_gates, self._epsilon):
             control_labels = list(range(target_label))
             diagonal_ucg = self._append_ucg_up_to_diagonal(
                 circuit, q, single_qubit_gates, control_labels, target_label
             )
             # merge the diagonal into the UCGate for efficient application of both together
-            diagonal_ucg_inverse = np.conj(diagonal_ucg).tolist()
-            single_qubit_gates = _merge_UCGate_and_diag(single_qubit_gates, diagonal_ucg_inverse)
+            diagonal_ucg_inverse = np.conj(diagonal_ucg).astype(complex, copy=False)
+            single_qubit_gates = isometry_rs.merge_ucgate_and_diag(
+                single_qubit_gates, diagonal_ucg_inverse
+            )
             # apply the UCGate (with the merged diagonal gate) to the remaining isometry
-            _apply_ucg(v, len(control_labels), single_qubit_gates)
+            v = isometry_rs.apply_ucg(v, len(control_labels), single_qubit_gates)
             # update the diag according to the applied diagonal gate
-            _apply_diagonal_gate_to_diag(
+            diag = isometry_rs.apply_diagonal_gate_to_diag(
                 diag, control_labels + [target_label], diagonal_ucg_inverse, n
             )
             # # correct for the implementation "up to diagonal"
             # diag_inv = np.conj(diag).tolist()
             # _apply_diagonal_gate(v, control_labels + [target_label], diag_inv)
+        return v, diag
 
     # This method finds the single-qubit gates for a UCGate to disentangle a qubit:
     # we consider the n-qubit state v[:,0] starting with k zeros (in the computational basis).
     # The qubit with label n-s-1 is disentangled into the basis state k_s(k,s).
     def _find_squs_for_disentangling(self, v, k, s):
-        k_prime = 0
-        n = int(math.log2(self.iso_data.shape[0]))
-        if _b(k, s + 1) == 0:
-            i_start = _a(k, s + 1)
-        else:
-            i_start = _a(k, s + 1) + 1
-        id_list = [np.eye(2, 2) for _ in range(i_start)]
-        squs = [
-            _reverse_qubit_state(
-                [
-                    v[2 * i * 2**s + _b(k, s), k_prime],
-                    v[(2 * i + 1) * 2**s + _b(k, s), k_prime],
-                ],
-                _k_s(k, s),
-                self._epsilon,
-            )
-            for i in range(i_start, 2 ** (n - s - 1))
-        ]
-        return id_list + squs
+        res = isometry_rs.find_squs_for_disentangling(
+            v, k, s, self._epsilon, n=int(math.log2(self.iso_data.shape[0]))
+        )
+        return res
 
     # Append a UCGate up to diagonal to the circuit circ.
     def _append_ucg_up_to_diagonal(self, circ, q, single_qubit_gates, control_labels, target_label):
@@ -338,146 +343,6 @@ def inv_gate(self):
         return self._inverse
 
 
-# Find special unitary matrix that maps [c0,c1] to [r,0] or [0,r] if basis_state=0 or
-# basis_state=1 respectively
-def _reverse_qubit_state(state, basis_state, epsilon):
-    state = np.array(state)
-    r = np.linalg.norm(state)
-    if r < epsilon:
-        return np.eye(2, 2)
-    if basis_state == 0:
-        m = np.array([[np.conj(state[0]), np.conj(state[1])], [-state[1], state[0]]]) / r
-    else:
-        m = np.array([[-state[1], state[0]], [np.conj(state[0]), np.conj(state[1])]]) / r
-    return m
-
-
-# Methods for applying gates to matrices (should be moved to Qiskit AER)
-
-# Input: matrix m with 2^n rows (and arbitrary many columns). Think of the columns as states
-#  on n qubits. The method applies a uniformly controlled gate (UCGate) to all the columns, where
-#  the UCGate is specified by the inputs k and single_qubit_gates:
-
-#  k =  number of controls. We assume that the controls are on the k most significant qubits
-#       (and the target is on the (k+1)th significant qubit)
-#  single_qubit_gates =     [u_0,...,u_{2^k-1}], where the u_i's are 2*2 unitaries
-#                           (provided as numpy arrays)
-
-# The order of the single-qubit unitaries is such that the first unitary u_0 is applied to the
-# (k+1)th significant qubit if the control qubits are in the state ket(0...00), the gate u_1 is
-# applied if the control qubits are in the state ket(0...01), and so on.
-
-# The input matrix m and the single-qubit gates have to be of dtype=complex.
-
-
-def _apply_ucg(m, k, single_qubit_gates):
-    # ToDo: Improve efficiency by parallelizing the gate application. A generalized version of
-    # ToDo: this method should be implemented by the state vector simulator in Qiskit AER.
-    num_qubits = int(math.log2(m.shape[0]))
-    num_col = m.shape[1]
-    spacing = 2 ** (num_qubits - k - 1)
-    for j in range(2 ** (num_qubits - 1)):
-        i = (j // spacing) * spacing + j
-        gate_index = i // (2 ** (num_qubits - k))
-        for col in range(num_col):
-            m[np.array([i, i + spacing]), np.array([col, col])] = np.ndarray.flatten(
-                single_qubit_gates[gate_index].dot(np.array([[m[i, col]], [m[i + spacing, col]]]))
-            ).tolist()
-    return m
-
-
-# Apply a diagonal gate with diagonal entries liste in diag and acting on qubits with labels
-#  action_qubit_labels to a matrix m.
-# The input matrix m has to be of dtype=complex
-# The qubit labels are such that label 0 corresponds to the most significant qubit, label 1 to
-#  the second most significant qubit, and so on ...
-
-
-def _apply_diagonal_gate(m, action_qubit_labels, diag):
-    # ToDo: Improve efficiency by parallelizing the gate application. A generalized version of
-    # ToDo: this method should be implemented by the state vector simulator in Qiskit AER.
-    num_qubits = int(math.log2(m.shape[0]))
-    num_cols = m.shape[1]
-    basis_states = list(itertools.product([0, 1], repeat=num_qubits))
-    for state in basis_states:
-        state_on_action_qubits = [state[i] for i in action_qubit_labels]
-        diag_index = _bin_to_int(state_on_action_qubits)
-        i = _bin_to_int(state)
-        for j in range(num_cols):
-            m[i, j] = diag[diag_index] * m[i, j]
-    return m
-
-
-# Special case of the method _apply_diagonal_gate, where the input m is a diagonal matrix on the
-# log2(len(m_diagonal)) least significant qubits (this method is more efficient in this case
-# than _apply_diagonal_gate). The input m_diagonal is provided as a list of diagonal entries.
-# The diagonal diag is applied on the qubits with labels listed in action_qubit_labels. The input
-# num_qubits gives the total number of considered qubits (this input is required to interpret the
-# action_qubit_labels in relation to the least significant qubits).
-
-
-def _apply_diagonal_gate_to_diag(m_diagonal, action_qubit_labels, diag, num_qubits):
-    if not m_diagonal:
-        return m_diagonal
-    basis_states = list(itertools.product([0, 1], repeat=num_qubits))
-    for state in basis_states[: len(m_diagonal)]:
-        state_on_action_qubits = [state[i] for i in action_qubit_labels]
-        diag_index = _bin_to_int(state_on_action_qubits)
-        i = _bin_to_int(state)
-        m_diagonal[i] *= diag[diag_index]
-    return m_diagonal
-
-
-# Apply a MC single-qubit gate (given by the 2*2 unitary input: gate) with controlling on
-# the qubits with label control_labels and acting on the qubit with label target_label
-# to a matrix m. The input matrix m and the gate have to be of dtype=complex. The qubit labels are
-# such that label 0 corresponds to the most significant qubit, label 1 to the second most
-# significant qubit, and so on ...
-
-
-def _apply_multi_controlled_gate(m, control_labels, target_label, gate):
-    # ToDo: This method should be integrated into the state vector simulator in Qiskit AER.
-    num_qubits = int(math.log2(m.shape[0]))
-    num_cols = m.shape[1]
-    control_labels.sort()
-    free_qubits = num_qubits - len(control_labels) - 1
-    basis_states_free = list(itertools.product([0, 1], repeat=free_qubits))
-    for state_free in basis_states_free:
-        (e1, e2) = _construct_basis_states(state_free, control_labels, target_label)
-        for i in range(num_cols):
-            m[np.array([e1, e2]), np.array([i, i])] = np.ndarray.flatten(
-                gate.dot(np.array([[m[e1, i]], [m[e2, i]]]))
-            ).tolist()
-    return m
-
-
-# Helper method for _apply_multi_controlled_gate. This constructs the basis states the MG gate
-# is acting on for a specific state state_free of the qubits we neither control nor act on.
-
-
-def _construct_basis_states(state_free, control_labels, target_label):
-    e1 = []
-    e2 = []
-    j = 0
-    for i in range(len(state_free) + len(control_labels) + 1):
-        if i in control_labels:
-            e1.append(1)
-            e2.append(1)
-        elif i == target_label:
-            e1.append(0)
-            e2.append(1)
-        else:
-            e1.append(state_free[j])
-            e2.append(state_free[j])
-            j += 1
-    out1 = _bin_to_int(e1)
-    out2 = _bin_to_int(e2)
-    return out1, out2
-
-
-# Some helper methods:
-
-
 # Get the qubits in the list qubits corresponding to the labels listed in labels. The total number
 # of qubits is given by num_qubits (and determines the convention for the qubit labeling)
 
@@ -496,14 +361,6 @@ def _reverse_qubit_oder(qubits):
 # Convert list of binary digits to integer
 
 
-def _bin_to_int(binary_digits_as_list):
-    return int("".join(str(x) for x in binary_digits_as_list), 2)
-
-
-def _ct(m):
-    return np.transpose(np.conjugate(m))
-
-
 def _get_binary_rep_as_list(n, num_digits):
     binary_string = np.binary_repr(n).zfill(num_digits)
     binary = []
@@ -511,64 +368,3 @@ def _get_binary_rep_as_list(n, num_digits):
         for c in line:
             binary.append(int(c))
     return binary[-num_digits:]
-
-
-# absorb a diagonal gate into a UCGate
-
-
-def _merge_UCGate_and_diag(single_qubit_gates, diag):
-    for i, gate in enumerate(single_qubit_gates):
-        single_qubit_gates[i] = np.array([[diag[2 * i], 0.0], [0.0, diag[2 * i + 1]]]).dot(gate)
-    return single_qubit_gates
-
-
-# Helper variables/functions for the column-by-column decomposition
-
-
-# a(k,s) and b(k,s) are positive integers such that k = a(k,s)2^s + b(k,s)
-# (with the maximal choice of a(k,s))
-
-
-def _a(k, s):
-    return k // 2**s
-
-
-def _b(k, s):
-    return k - (_a(k, s) * 2**s)
-
-
-# given a binary representation of k with binary digits [k_{n-1},..,k_1,k_0],
-# the method k_s(k, s) returns k_s
-
-
-def _k_s(k, s):
-    if k == 0:
-        return 0
-    else:
-        num_digits = s + 1
-        return _get_binary_rep_as_list(k, num_digits)[0]
-
-
-# Check if a gate of a special form is equal to the identity gate up to global phase
-
-
-def _ucg_is_identity_up_to_global_phase(single_qubit_gates, epsilon):
-    if not np.abs(single_qubit_gates[0][0, 0]) < epsilon:
-        global_phase = 1.0 / (single_qubit_gates[0][0, 0])
-    else:
-        return False
-    for gate in single_qubit_gates:
-        if not np.allclose(global_phase * gate, np.eye(2, 2)):
-            return False
-    return True
-
-
-def _diag_is_identity_up_to_global_phase(diag, epsilon):
-    if not np.abs(diag[0]) < epsilon:
-        global_phase = 1.0 / (diag[0])
-    else:
-        return False
-    for d in diag:
-        if not np.abs(global_phase * d - 1) < epsilon:
-            return False
-    return True
diff --git a/qiskit/circuit/library/generalized_gates/pauli.py b/qiskit/circuit/library/generalized_gates/pauli.py
index e8b063a75e9e..01bbd09c1979 100644
--- a/qiskit/circuit/library/generalized_gates/pauli.py
+++ b/qiskit/circuit/library/generalized_gates/pauli.py
@@ -63,13 +63,13 @@ def inverse(self, annotated: bool = False):
         r"""Return inverted pauli gate (itself)."""
         return PauliGate(self.params[0])  # self-inverse
 
-    def __array__(self, dtype=None):
+    def __array__(self, dtype=None, copy=None):
         """Return a Numpy.array for the pauli gate.
         i.e. tensor product of the paulis"""
         # pylint: disable=cyclic-import
         from qiskit.quantum_info.operators import Pauli
 
-        return Pauli(self.params[0]).__array__(dtype=dtype)
+        return Pauli(self.params[0]).__array__(dtype=dtype, copy=copy)
 
     def validate_parameter(self, parameter):
         if isinstance(parameter, str):
diff --git a/qiskit/circuit/library/generalized_gates/permutation.py b/qiskit/circuit/library/generalized_gates/permutation.py
index 8888344c78bc..776c69d94f01 100644
--- a/qiskit/circuit/library/generalized_gates/permutation.py
+++ b/qiskit/circuit/library/generalized_gates/permutation.py
@@ -147,8 +147,11 @@ def __init__(
 
         super().__init__(name="permutation", num_qubits=num_qubits, params=[pattern])
 
-    def __array__(self, dtype=None):
+    def __array__(self, dtype=None, copy=None):
         """Return a numpy.array for the Permutation gate."""
+        if copy is False:
+            raise ValueError("unable to avoid copy while creating an array as requested")
+
         nq = len(self.pattern)
         mat = np.zeros((2**nq, 2**nq), dtype=dtype)
 
diff --git a/qiskit/circuit/library/generalized_gates/uc.py b/qiskit/circuit/library/generalized_gates/uc.py
index 2d650e98466a..f54567123e02 100644
--- a/qiskit/circuit/library/generalized_gates/uc.py
+++ b/qiskit/circuit/library/generalized_gates/uc.py
@@ -21,7 +21,6 @@
 
 from __future__ import annotations
 
-import cmath
 import math
 
 import numpy as np
@@ -33,14 +32,11 @@
 from qiskit.circuit.quantumcircuit import QuantumCircuit
 from qiskit.circuit.exceptions import CircuitError
 from qiskit.exceptions import QiskitError
-
-# pylint: disable=cyclic-import
-from qiskit.synthesis.one_qubit.one_qubit_decompose import OneQubitEulerDecomposer
+from qiskit._accelerate import uc_gate
 
 from .diagonal import Diagonal
 
 _EPS = 1e-10  # global variable used to chop very small numbers to zero
-_DECOMPOSER1Q = OneQubitEulerDecomposer("U3")
 
 
 class UCGate(Gate):
@@ -203,99 +199,7 @@ def _dec_ucg_help(self):
         https://arxiv.org/pdf/quant-ph/0410066.pdf.
         """
         single_qubit_gates = [gate.astype(complex) for gate in self.params]
-        diag = np.ones(2**self.num_qubits, dtype=complex)
-        num_contr = self.num_qubits - 1
-        for dec_step in range(num_contr):
-            num_ucgs = 2**dec_step
-            # The decomposition works recursively and the following loop goes over the different
-            # UCGates that arise in the decomposition
-            for ucg_index in range(num_ucgs):
-                len_ucg = 2 ** (num_contr - dec_step)
-                for i in range(int(len_ucg / 2)):
-                    shift = ucg_index * len_ucg
-                    a = single_qubit_gates[shift + i]
-                    b = single_qubit_gates[shift + len_ucg // 2 + i]
-                    # Apply the decomposition for UCGates given in equation (3) in
-                    # https://arxiv.org/pdf/quant-ph/0410066.pdf
-                    # to demultiplex one control of all the num_ucgs uniformly-controlled gates
-                    #  with log2(len_ucg) uniform controls
-                    v, u, r = self._demultiplex_single_uc(a, b)
-                    #  replace the single-qubit gates with v,u (the already existing ones
-                    #  are not needed any more)
-                    single_qubit_gates[shift + i] = v
-                    single_qubit_gates[shift + len_ucg // 2 + i] = u
-                    # Now we decompose the gates D as described in Figure 4  in
-                    # https://arxiv.org/pdf/quant-ph/0410066.pdf and merge some of the gates
-                    # into the UCGates and the diagonal at the end of the circuit
-
-                    # Remark: The Rz(pi/2) rotation acting on the target qubit and the Hadamard
-                    # gates arising in the decomposition of D are ignored for the moment (they will
-                    # be added together with the C-NOT gates at the end of the decomposition
-                    # (in the method dec_ucg()))
-                    if ucg_index < num_ucgs - 1:
-                        # Absorb the Rz(pi/2) rotation on the control into the UC-Rz gate and
-                        # merge the UC-Rz rotation with the following UCGate,
-                        # which hasn't been decomposed yet.
-                        k = shift + len_ucg + i
-                        single_qubit_gates[k] = single_qubit_gates[k].dot(
-                            UCGate._ct(r)
-                        ) * UCGate._rz(np.pi / 2).item((0, 0))
-                        k = k + len_ucg // 2
-                        single_qubit_gates[k] = single_qubit_gates[k].dot(r) * UCGate._rz(
-                            np.pi / 2
-                        ).item((1, 1))
-                    else:
-                        # Absorb the Rz(pi/2) rotation on the control into the UC-Rz gate and merge
-                        # the trailing UC-Rz rotation into a diagonal gate at the end of the circuit
-                        for ucg_index_2 in range(num_ucgs):
-                            shift_2 = ucg_index_2 * len_ucg
-                            k = 2 * (i + shift_2)
-                            diag[k] = (
-                                diag[k]
-                                * UCGate._ct(r).item((0, 0))
-                                * UCGate._rz(np.pi / 2).item((0, 0))
-                            )
-                            diag[k + 1] = (
-                                diag[k + 1]
-                                * UCGate._ct(r).item((1, 1))
-                                * UCGate._rz(np.pi / 2).item((0, 0))
-                            )
-                            k = len_ucg + k
-                            diag[k] *= r.item((0, 0)) * UCGate._rz(np.pi / 2).item((1, 1))
-                            diag[k + 1] *= r.item((1, 1)) * UCGate._rz(np.pi / 2).item((1, 1))
-        return single_qubit_gates, diag
-
-    def _demultiplex_single_uc(self, a, b):
-        """
-        This method implements the decomposition given in equation (3) in
-        https://arxiv.org/pdf/quant-ph/0410066.pdf.
-        The decomposition is used recursively to decompose uniformly controlled gates.
-        a,b = single qubit unitaries
-        v,u,r = outcome of the decomposition given in the reference mentioned above
-        (see there for the details).
-        """
-        # The notation is chosen as in https://arxiv.org/pdf/quant-ph/0410066.pdf.
-        x = a.dot(UCGate._ct(b))
-        det_x = np.linalg.det(x)
-        x11 = x.item((0, 0)) / cmath.sqrt(det_x)
-        phi = cmath.phase(det_x)
-        r1 = cmath.exp(1j / 2 * (np.pi / 2 - phi / 2 - cmath.phase(x11)))
-        r2 = cmath.exp(1j / 2 * (np.pi / 2 - phi / 2 + cmath.phase(x11) + np.pi))
-        r = np.array([[r1, 0], [0, r2]], dtype=complex)
-        d, u = np.linalg.eig(r.dot(x).dot(r))
-        # If d is not equal to diag(i,-i), then we put it into this "standard" form
-        # (see eq. (13) in https://arxiv.org/pdf/quant-ph/0410066.pdf) by interchanging
-        # the eigenvalues and eigenvectors.
-        if abs(d[0] + 1j) < _EPS:
-            d = np.flip(d, 0)
-            u = np.flip(u, 1)
-        d = np.diag(np.sqrt(d))
-        v = d.dot(UCGate._ct(u)).dot(UCGate._ct(r)).dot(b)
-        return v, u, r
-
-    @staticmethod
-    def _ct(m):
-        return np.transpose(np.conjugate(m))
+        return uc_gate.dec_ucg_help(single_qubit_gates, self.num_qubits)
 
     @staticmethod
     def _rz(alpha):
diff --git a/qiskit/circuit/library/generalized_gates/unitary.py b/qiskit/circuit/library/generalized_gates/unitary.py
index 618041142227..1fd36e52e0c0 100644
--- a/qiskit/circuit/library/generalized_gates/unitary.py
+++ b/qiskit/circuit/library/generalized_gates/unitary.py
@@ -18,6 +18,7 @@
 import typing
 import numpy
 
+from qiskit import _numpy_compat
 from qiskit.circuit.gate import Gate
 from qiskit.circuit.controlledgate import ControlledGate
 from qiskit.circuit.annotated_operation import AnnotatedOperation, ControlModifier
@@ -118,10 +119,10 @@ def __eq__(self, other):
             return False
         return matrix_equal(self.params[0], other.params[0])
 
-    def __array__(self, dtype=None):
+    def __array__(self, dtype=None, copy=_numpy_compat.COPY_ONLY_IF_NEEDED):
         """Return matrix for the unitary."""
-        # pylint: disable=unused-argument
-        return self.params[0]
+        dtype = self.params[0].dtype if dtype is None else dtype  # default: the stored dtype
+        return numpy.array(self.params[0], dtype=dtype, copy=copy)
 
     def inverse(self, annotated: bool = False):
         """Return the adjoint of the unitary."""
diff --git a/qiskit/circuit/library/hamiltonian_gate.py b/qiskit/circuit/library/hamiltonian_gate.py
index a87504a97b36..2997d01ed487 100644
--- a/qiskit/circuit/library/hamiltonian_gate.py
+++ b/qiskit/circuit/library/hamiltonian_gate.py
@@ -21,6 +21,7 @@
 from numbers import Number
 import numpy as np
 
+from qiskit import _numpy_compat
 from qiskit.circuit.gate import Gate
 from qiskit.circuit.quantumcircuit import QuantumCircuit
 from qiskit.circuit.quantumregister import QuantumRegister
@@ -92,18 +93,22 @@ def __eq__(self, other):
         times_eq = self.params[1] == other.params[1]
         return operators_eq and times_eq
 
-    def __array__(self, dtype=None):
+    def __array__(self, dtype=None, copy=None):
         """Return matrix for the unitary."""
-        # pylint: disable=unused-argument
         import scipy.linalg
 
+        if copy is False:  # the matrix is computed afresh below, so zero-copy is impossible
+            raise ValueError("unable to avoid copy while creating an array as requested")
         try:
-            return scipy.linalg.expm(-1j * self.params[0] * float(self.params[1]))
+            time = float(self.params[1])  # a TypeError here is reported as an unbound time
         except TypeError as ex:
             raise TypeError(
                 "Unable to generate Unitary matrix for "
                 "unbound t parameter {}".format(self.params[1])
             ) from ex
+        arr = scipy.linalg.expm(-1j * self.params[0] * time)  # matrix exponential of -i * H * t
+        dtype = complex if dtype is None else dtype
+        return np.array(arr, dtype=dtype, copy=_numpy_compat.COPY_ONLY_IF_NEEDED)
 
     def inverse(self, annotated: bool = False):
         """Return the adjoint of the unitary."""
diff --git a/qiskit/circuit/library/n_local/efficient_su2.py b/qiskit/circuit/library/n_local/efficient_su2.py
index fc72a2a6c530..e27fe407e188 100644
--- a/qiskit/circuit/library/n_local/efficient_su2.py
+++ b/qiskit/circuit/library/n_local/efficient_su2.py
@@ -110,11 +110,11 @@ def __init__(
                 If only one gate is provided, the same gate is applied to each qubit.
                 If a list of gates is provided, all gates are applied to each qubit in the provided
                 order.
-            entanglement: Specifies the entanglement structure. Can be a string ('full', 'linear'
-                , 'reverse_linear', 'circular' or 'sca'), a list of integer-pairs specifying the indices
-                of qubits entangled with one another, or a callable returning such a list provided with
-                the index of the entanglement layer.
-                Default to 'reverse_linear' entanglement.
+            entanglement: Specifies the entanglement structure. Can be a string
+                ('full', 'linear', 'reverse_linear', 'pairwise', 'circular', or 'sca'),
+                a list of integer-pairs specifying the indices of qubits entangled with one another,
+                or a callable returning such a list provided with the index of the entanglement layer.
+                Defaults to 'reverse_linear' entanglement.
                 Note that 'reverse_linear' entanglement provides the same unitary as 'full'
                 with fewer entangling gates.
                 See the Examples section of :class:`~qiskit.circuit.library.TwoLocal` for more
diff --git a/qiskit/circuit/library/n_local/n_local.py b/qiskit/circuit/library/n_local/n_local.py
index e4f5b9be4bf4..430edfd94f39 100644
--- a/qiskit/circuit/library/n_local/n_local.py
+++ b/qiskit/circuit/library/n_local/n_local.py
@@ -13,16 +13,24 @@
 """The n-local circuit class."""
 
 from __future__ import annotations
+
+import collections
+import itertools
 import typing
 from collections.abc import Callable, Mapping, Sequence
 
-from itertools import combinations
-
 import numpy
 from qiskit.circuit.quantumcircuit import QuantumCircuit
 from qiskit.circuit.quantumregister import QuantumRegister
-from qiskit.circuit import Instruction, Parameter, ParameterVector, ParameterExpression
+from qiskit.circuit import (
+    Instruction,
+    Parameter,
+    ParameterVector,
+    ParameterExpression,
+    CircuitInstruction,
+)
 from qiskit.exceptions import QiskitError
+from qiskit.circuit.library.standard_gates import get_standard_gate_name_mapping
 
 from ..blueprintcircuit import BlueprintCircuit
 
@@ -154,6 +162,17 @@ def __init__(
         self._bounds: list[tuple[float | None, float | None]] | None = None
         self._flatten = flatten
 
+        # During the build, if a subclass hasn't overridden our parametrisation methods, we can use
+        # a newer fast-path method to parametrise the rotation and entanglement blocks if internally
+        # those are just simple stdlib gates that have been promoted to circuits.  We don't
+        # precalculate the fast-path layers themselves because there's far too much that can be
+        # overridden between object construction and build, and far too many subclasses of `NLocal`
+        # that override bits and bobs of the internal private methods, so it'd be too hard to keep
+        # everything in sync.
+        self._allow_fast_path_parametrization = (
+            getattr(self._parameter_generator, "__func__", None) is NLocal._parameter_generator
+        )
+
         if int(reps) != reps:
             raise TypeError("The value of reps should be int")
 
@@ -779,13 +798,10 @@ def add_layer(
             else:
                 entangler_map = entanglement
 
-            layer = QuantumCircuit(self.num_qubits)
             for i in entangler_map:
                 params = self.ordered_parameters[-len(get_parameters(block)) :]
                 parameterized_block = self._parameterize_block(block, params=params)
-                layer.compose(parameterized_block, i, inplace=True)
-
-            self.compose(layer, inplace=True)
+                self.compose(parameterized_block, i, inplace=True, copy=False)
         else:
             # cannot prepend a block currently, just rebuild
             self._invalidate()
@@ -843,52 +859,65 @@ def _build_rotation_layer(self, circuit, param_iter, i):
         """Build a rotation layer."""
         # if the unentangled qubits are skipped, compute the set of qubits that are not entangled
         if self._skip_unentangled_qubits:
-            unentangled_qubits = self.get_unentangled_qubits()
+            skipped_qubits = self.get_unentangled_qubits()
+        else:
+            skipped_qubits = set()
+
+        target_qubits = circuit.qubits
 
         # iterate over all rotation blocks
         for j, block in enumerate(self.rotation_blocks):
-            # create a new layer
-            layer = QuantumCircuit(*self.qregs)
-
-            # we apply the rotation gates stacked on top of each other, i.e.
-            # if we have 4 qubits and a rotation block of width 2, we apply two instances
-            block_indices = [
-                list(range(k * block.num_qubits, (k + 1) * block.num_qubits))
-                for k in range(self.num_qubits // block.num_qubits)
-            ]
-
-            # if unentangled qubits should not be acted on, remove all operations that
-            # touch an unentangled qubit
-            if self._skip_unentangled_qubits:
+            skipped_blocks = {qubit // block.num_qubits for qubit in skipped_qubits}
+            if (
+                self._allow_fast_path_parametrization
+                and (simple_block := _stdlib_gate_from_simple_block(block)) is not None
+            ):
+                all_qubits = (
+                    tuple(target_qubits[k * block.num_qubits : (k + 1) * block.num_qubits])
+                    for k in range(self.num_qubits // block.num_qubits)
+                    if k not in skipped_blocks
+                )
+                for qubits in all_qubits:
+                    instr = CircuitInstruction(
+                        simple_block.gate(*itertools.islice(param_iter, simple_block.num_params)),
+                        qubits,
+                    )
+                    circuit._append(instr)
+            else:
                 block_indices = [
-                    indices
-                    for indices in block_indices
-                    if set(indices).isdisjoint(unentangled_qubits)
+                    list(range(k * block.num_qubits, (k + 1) * block.num_qubits))
+                    for k in range(self.num_qubits // block.num_qubits)
+                    if k not in skipped_blocks
                 ]
-
-            # apply the operations in the layer
-            for indices in block_indices:
-                parameterized_block = self._parameterize_block(block, param_iter, i, j, indices)
-                layer.compose(parameterized_block, indices, inplace=True)
-
-            # add the layer to the circuit
-            circuit.compose(layer, inplace=True)
+                # apply the operations in the layer
+                for indices in block_indices:
+                    parameterized_block = self._parameterize_block(block, param_iter, i, j, indices)
+                    circuit.compose(parameterized_block, indices, inplace=True, copy=False)
 
     def _build_entanglement_layer(self, circuit, param_iter, i):
         """Build an entanglement layer."""
+        target_qubits = circuit.qubits  # fetched once; indexed for every fast-path instruction
         # iterate over all entanglement blocks
         for j, block in enumerate(self.entanglement_blocks):
-            # create a new layer and get the entangler map for this block
-            layer = QuantumCircuit(*self.qregs)
             entangler_map = self.get_entangler_map(i, j, block.num_qubits)
-
-            # apply the operations in the layer
-            for indices in entangler_map:
-                parameterized_block = self._parameterize_block(block, param_iter, i, j, indices)
-                layer.compose(parameterized_block, indices, inplace=True)
-
-            # add the layer to the circuit
-            circuit.compose(layer, inplace=True)
+            if (
+                self._allow_fast_path_parametrization
+                and (simple_block := _stdlib_gate_from_simple_block(block)) is not None
+            ):
+                for indices in entangler_map:
+                    # It's actually nontrivially faster to use a listcomp and pass that to `tuple`
+                    # than to pass a generator expression directly.
+                    # pylint: disable=consider-using-generator
+                    instr = CircuitInstruction(
+                        simple_block.gate(*itertools.islice(param_iter, simple_block.num_params)),
+                        tuple([target_qubits[index] for index in indices]),
+                    )
+                    circuit._append(instr)
+            else:
+                # apply the operations in the layer
+                for indices in entangler_map:
+                    parameterized_block = self._parameterize_block(block, param_iter, i, j, indices)
+                    circuit.compose(parameterized_block, indices, inplace=True, copy=False)
 
     def _build_additional_layers(self, circuit, which):
         if which == "appended":
@@ -901,13 +930,10 @@ def _build_additional_layers(self, circuit, which):
             raise ValueError("`which` must be either `appended` or `prepended`.")
 
         for block, ent in zip(blocks, entanglements):
-            layer = QuantumCircuit(*self.qregs)
             if isinstance(ent, str):
                 ent = get_entangler_map(block.num_qubits, self.num_qubits, ent)
             for indices in ent:
-                layer.compose(block, indices, inplace=True)
-
-            circuit.compose(layer, inplace=True)
+                circuit.compose(block, indices, inplace=True, copy=False)
 
     def _build(self) -> None:
         """If not already built, build the circuit."""
@@ -926,7 +952,7 @@ def _build(self) -> None:
 
         # use the initial state as starting circuit, if it is set
         if self.initial_state:
-            circuit.compose(self.initial_state.copy(), inplace=True)
+            circuit.compose(self.initial_state.copy(), inplace=True, copy=False)
 
         param_iter = iter(self.ordered_parameters)
 
@@ -972,7 +998,7 @@ def _build(self) -> None:
             except QiskitError:
                 block = circuit.to_instruction()
 
-            self.append(block, self.qubits)
+            self.append(block, self.qubits, copy=False)
 
     # pylint: disable=unused-argument
     def _parameter_generator(self, rep: int, block: int, indices: list[int]) -> Parameter | None:
@@ -1023,7 +1049,7 @@ def get_entangler_map(
         raise ValueError("Pairwise entanglement is not defined for blocks with more than 2 qubits.")
 
     if entanglement == "full":
-        return list(combinations(list(range(n)), m))
+        return list(itertools.combinations(list(range(n)), m))
     elif entanglement == "reverse_linear":
         # reverse linear connectivity. In the case of m=2 and the entanglement_block='cx'
         # then it's equivalent to 'full' entanglement
@@ -1057,3 +1083,28 @@ def get_entangler_map(
 
     else:
         raise ValueError(f"Unsupported entanglement type: {entanglement}")
+
+
+_StdlibGateResult = collections.namedtuple("_StdlibGateResult", ("gate", "num_params"))
+_STANDARD_GATE_MAPPING = get_standard_gate_name_mapping()
+
+
+def _stdlib_gate_from_simple_block(block: QuantumCircuit) -> _StdlibGateResult | None:
+    """Return the gate class and parameter count if ``block`` is one bare standard gate,
+    or ``None`` if the block does not have that simple shape."""
+    if block.global_phase != 0.0 or len(block) != 1:
+        return None
+    instruction = block.data[0]
+    operation = instruction.operation
+    # The single gate must touch no clbits and span all block qubits in the block's order.
+    if instruction.clbits or tuple(instruction.qubits) != tuple(block.qubits):
+        return None
+    # Its class must be the one the standard-gate name mapping holds under the same name.
+    standard = _STANDARD_GATE_MAPPING.get(operation.name)
+    if getattr(standard, "base_class", None) is not operation.base_class:
+        return None
+    # Its parameters must match the block's parameters element for element, in order
+    # (the intent is to admit only plain `Parameter` objects, not expressions).
+    if tuple(operation.params) != tuple(block.parameters):
+        return None
+    return _StdlibGateResult(operation.base_class, len(operation.params))
diff --git a/qiskit/circuit/library/overlap.py b/qiskit/circuit/library/overlap.py
index ed86d8abb9a2..38f5fb9184e1 100644
--- a/qiskit/circuit/library/overlap.py
+++ b/qiskit/circuit/library/overlap.py
@@ -26,11 +26,11 @@ class UnitaryOverlap(QuantumCircuit):
     names `"p1"` (for circuit ``unitary1``) and `"p2"` (for circuit ``unitary_2``) in the output
     circuit.
 
-    This circuit is usually employed in computing the fidelity::
+    This circuit is usually employed in computing the fidelity:
 
-        .. math::
+    .. math::
 
-            \left|\langle 0| U_2^{\dag} U_1|0\rangle\right|^{2}
+        \left|\langle 0| U_2^{\dag} U_1|0\rangle\right|^{2}
 
     by computing the probability of being in the all-zeros bit-string, or equivalently,
     the expectation value of projector :math:`|0\rangle\langle 0|`.
diff --git a/qiskit/circuit/library/standard_gates/global_phase.py b/qiskit/circuit/library/standard_gates/global_phase.py
index 50576bf17ab4..ccd758e47241 100644
--- a/qiskit/circuit/library/standard_gates/global_phase.py
+++ b/qiskit/circuit/library/standard_gates/global_phase.py
@@ -69,10 +69,12 @@ def inverse(self, annotated: bool = False):
         """
         return GlobalPhaseGate(-self.params[0])
 
-    def __array__(self, dtype=complex):
+    def __array__(self, dtype=None, copy=None):
         """Return a numpy.array for the global_phase gate."""
+        if copy is False:  # the 1x1 matrix is rebuilt on every call, so zero-copy is impossible
+            raise ValueError("unable to avoid copy while creating an array as requested")
         theta = self.params[0]
-        return numpy.array([[numpy.exp(1j * theta)]], dtype=dtype)
+        return numpy.array([[numpy.exp(1j * theta)]], dtype=complex if dtype is None else dtype)
 
     def __eq__(self, other):
         if isinstance(other, GlobalPhaseGate):
diff --git a/qiskit/circuit/library/standard_gates/p.py b/qiskit/circuit/library/standard_gates/p.py
index 179be025bcd0..6de0307dc798 100644
--- a/qiskit/circuit/library/standard_gates/p.py
+++ b/qiskit/circuit/library/standard_gates/p.py
@@ -140,8 +140,10 @@ def inverse(self, annotated: bool = False):
         """
         return PhaseGate(-self.params[0])
 
-    def __array__(self, dtype=None):
+    def __array__(self, dtype=None, copy=None):
         """Return a numpy.array for the Phase gate."""
+        if copy is False:  # the matrix is rebuilt on every call, so zero-copy is impossible
+            raise ValueError("unable to avoid copy while creating an array as requested")
         lam = float(self.params[0])
         return numpy.array([[1, 0], [0, exp(1j * lam)]], dtype=dtype)
 
@@ -279,8 +281,10 @@ def inverse(self, annotated: bool = False):
         r"""Return inverted CPhase gate (:math:`CPhase(\lambda)^{\dagger} = CPhase(-\lambda)`)"""
         return CPhaseGate(-self.params[0], ctrl_state=self.ctrl_state)
 
-    def __array__(self, dtype=None):
+    def __array__(self, dtype=None, copy=None):
         """Return a numpy.array for the CPhase gate."""
+        if copy is False:
+            raise ValueError("unable to avoid copy while creating an array as requested")
         eith = exp(1j * float(self.params[0]))
         if self.ctrl_state:
             return numpy.array(
diff --git a/qiskit/circuit/library/standard_gates/r.py b/qiskit/circuit/library/standard_gates/r.py
index 3fc537baef90..9d4905e27866 100644
--- a/qiskit/circuit/library/standard_gates/r.py
+++ b/qiskit/circuit/library/standard_gates/r.py
@@ -93,8 +93,10 @@ def inverse(self, annotated: bool = False):
         """
         return RGate(-self.params[0], self.params[1])
 
-    def __array__(self, dtype=None):
+    def __array__(self, dtype=None, copy=None):
         """Return a numpy.array for the R gate."""
+        if copy is False:
+            raise ValueError("unable to avoid copy while creating an array as requested")
         theta, phi = float(self.params[0]), float(self.params[1])
         cos = math.cos(theta / 2)
         sin = math.sin(theta / 2)
diff --git a/qiskit/circuit/library/standard_gates/rx.py b/qiskit/circuit/library/standard_gates/rx.py
index 3483d5ebc956..eaa73cf87c91 100644
--- a/qiskit/circuit/library/standard_gates/rx.py
+++ b/qiskit/circuit/library/standard_gates/rx.py
@@ -120,8 +120,10 @@ def inverse(self, annotated: bool = False):
         """
         return RXGate(-self.params[0])
 
-    def __array__(self, dtype=None):
+    def __array__(self, dtype=None, copy=None):
         """Return a numpy.array for the RX gate."""
+        if copy is False:  # the matrix is rebuilt on every call, so zero-copy is impossible
+            raise ValueError("unable to avoid copy while creating an array as requested")
         cos = math.cos(self.params[0] / 2)
         sin = math.sin(self.params[0] / 2)
         return numpy.array([[cos, -1j * sin], [-1j * sin, cos]], dtype=dtype)
@@ -263,8 +265,10 @@ def inverse(self, annotated: bool = False):
         """
         return CRXGate(-self.params[0], ctrl_state=self.ctrl_state)
 
-    def __array__(self, dtype=None):
+    def __array__(self, dtype=None, copy=None):
         """Return a numpy.array for the CRX gate."""
+        if copy is False:
+            raise ValueError("unable to avoid copy while creating an array as requested")
         half_theta = float(self.params[0]) / 2
         cos = math.cos(half_theta)
         isin = 1j * math.sin(half_theta)
diff --git a/qiskit/circuit/library/standard_gates/rxx.py b/qiskit/circuit/library/standard_gates/rxx.py
index 03e9d22dcc24..c4e35e53d55e 100644
--- a/qiskit/circuit/library/standard_gates/rxx.py
+++ b/qiskit/circuit/library/standard_gates/rxx.py
@@ -122,8 +122,10 @@ def inverse(self, annotated: bool = False):
         """
         return RXXGate(-self.params[0])
 
-    def __array__(self, dtype=None):
+    def __array__(self, dtype=None, copy=None):
         """Return a Numpy.array for the RXX gate."""
+        if copy is False:
+            raise ValueError("unable to avoid copy while creating an array as requested")
         theta2 = float(self.params[0]) / 2
         cos = math.cos(theta2)
         isin = 1j * math.sin(theta2)
diff --git a/qiskit/circuit/library/standard_gates/ry.py b/qiskit/circuit/library/standard_gates/ry.py
index b902887ee0e0..633a518bca77 100644
--- a/qiskit/circuit/library/standard_gates/ry.py
+++ b/qiskit/circuit/library/standard_gates/ry.py
@@ -119,8 +119,10 @@ def inverse(self, annotated: bool = False):
         """
         return RYGate(-self.params[0])
 
-    def __array__(self, dtype=None):
+    def __array__(self, dtype=None, copy=None):
         """Return a numpy.array for the RY gate."""
+        if copy is False:  # the matrix is rebuilt on every call, so zero-copy is impossible
+            raise ValueError("unable to avoid copy while creating an array as requested")
         cos = math.cos(self.params[0] / 2)
         sin = math.sin(self.params[0] / 2)
         return numpy.array([[cos, -sin], [sin, cos]], dtype=dtype)
@@ -258,8 +260,10 @@ def inverse(self, annotated: bool = False):
         ."""
         return CRYGate(-self.params[0], ctrl_state=self.ctrl_state)
 
-    def __array__(self, dtype=None):
+    def __array__(self, dtype=None, copy=None):
         """Return a numpy.array for the CRY gate."""
+        if copy is False:
+            raise ValueError("unable to avoid copy while creating an array as requested")
         half_theta = float(self.params[0]) / 2
         cos = math.cos(half_theta)
         sin = math.sin(half_theta)
diff --git a/qiskit/circuit/library/standard_gates/ryy.py b/qiskit/circuit/library/standard_gates/ryy.py
index 50ce9b0c4f73..98847b7b2182 100644
--- a/qiskit/circuit/library/standard_gates/ryy.py
+++ b/qiskit/circuit/library/standard_gates/ryy.py
@@ -122,8 +122,10 @@ def inverse(self, annotated: bool = False):
         """
         return RYYGate(-self.params[0])
 
-    def __array__(self, dtype=None):
+    def __array__(self, dtype=None, copy=None):
         """Return a numpy.array for the RYY gate."""
+        if copy is False:
+            raise ValueError("unable to avoid copy while creating an array as requested")
         theta = float(self.params[0])
         cos = math.cos(theta / 2)
         isin = 1j * math.sin(theta / 2)
diff --git a/qiskit/circuit/library/standard_gates/rz.py b/qiskit/circuit/library/standard_gates/rz.py
index c7311b4a6e59..3040f9568346 100644
--- a/qiskit/circuit/library/standard_gates/rz.py
+++ b/qiskit/circuit/library/standard_gates/rz.py
@@ -130,10 +130,12 @@ def inverse(self, annotated: bool = False):
         """
         return RZGate(-self.params[0])
 
-    def __array__(self, dtype=None):
+    def __array__(self, dtype=None, copy=None):
         """Return a numpy.array for the RZ gate."""
         import numpy as np
 
+        if copy is False:  # the matrix is rebuilt on every call, so zero-copy is impossible
+            raise ValueError("unable to avoid copy while creating an array as requested")
         ilam2 = 0.5j * float(self.params[0])
         return np.array([[exp(-ilam2), 0], [0, exp(ilam2)]], dtype=dtype)
 
@@ -276,10 +278,12 @@ def inverse(self, annotated: bool = False):
         """
         return CRZGate(-self.params[0], ctrl_state=self.ctrl_state)
 
-    def __array__(self, dtype=None):
+    def __array__(self, dtype=None, copy=None):
         """Return a numpy.array for the CRZ gate."""
         import numpy
 
+        if copy is False:
+            raise ValueError("unable to avoid copy while creating an array as requested")
         arg = 1j * float(self.params[0]) / 2
         if self.ctrl_state:
             return numpy.array(
diff --git a/qiskit/circuit/library/standard_gates/rzx.py b/qiskit/circuit/library/standard_gates/rzx.py
index d59676663da1..1f930ab422df 100644
--- a/qiskit/circuit/library/standard_gates/rzx.py
+++ b/qiskit/circuit/library/standard_gates/rzx.py
@@ -166,10 +166,12 @@ def inverse(self, annotated: bool = False):
         """
         return RZXGate(-self.params[0])
 
-    def __array__(self, dtype=None):
+    def __array__(self, dtype=None, copy=None):
         """Return a numpy.array for the RZX gate."""
         import numpy
 
+        if copy is False:
+            raise ValueError("unable to avoid copy while creating an array as requested")
         half_theta = float(self.params[0]) / 2
         cos = math.cos(half_theta)
         isin = 1j * math.sin(half_theta)
diff --git a/qiskit/circuit/library/standard_gates/rzz.py b/qiskit/circuit/library/standard_gates/rzz.py
index 3a00fb7b7395..5ca974764d32 100644
--- a/qiskit/circuit/library/standard_gates/rzz.py
+++ b/qiskit/circuit/library/standard_gates/rzz.py
@@ -130,10 +130,12 @@ def inverse(self, annotated: bool = False):
         """
         return RZZGate(-self.params[0])
 
-    def __array__(self, dtype=None):
+    def __array__(self, dtype=None, copy=None):
         """Return a numpy.array for the RZZ gate."""
         import numpy
 
+        if copy is False:
+            raise ValueError("unable to avoid copy while creating an array as requested")
         itheta2 = 1j * float(self.params[0]) / 2
         return numpy.array(
             [
diff --git a/qiskit/circuit/library/standard_gates/u.py b/qiskit/circuit/library/standard_gates/u.py
index 81b48536f26b..3d631898850a 100644
--- a/qiskit/circuit/library/standard_gates/u.py
+++ b/qiskit/circuit/library/standard_gates/u.py
@@ -12,7 +12,7 @@
 
 """Two-pulse single-qubit gate."""
 import cmath
-import copy
+import copy as _copy
 import math
 from cmath import exp
 from typing import Optional, Union
@@ -136,8 +136,10 @@ def control(
             )
         return gate
 
-    def __array__(self, dtype=complex):
+    def __array__(self, dtype=None, copy=None):
         """Return a numpy.array for the U gate."""
+        if copy is False:
+            raise ValueError("unable to avoid copy while creating an array as requested")
         theta, phi, lam = (float(param) for param in self.params)
         cos = math.cos(theta / 2)
         sin = math.sin(theta / 2)
@@ -146,7 +148,7 @@ def __array__(self, dtype=complex):
                 [cos, -exp(1j * lam) * sin],
                 [exp(1j * phi) * sin, exp(1j * (phi + lam)) * cos],
             ],
-            dtype=dtype,
+            dtype=dtype or complex,
         )
 
     def __eq__(self, other):
@@ -337,8 +339,10 @@ def inverse(self, annotated: bool = False):
             ctrl_state=self.ctrl_state,
         )
 
-    def __array__(self, dtype=None):
+    def __array__(self, dtype=None, copy=None):
         """Return a numpy.array for the CU gate."""
+        if copy is False:
+            raise ValueError("unable to avoid copy while creating an array as requested")
         theta, phi, lam, gamma = (float(param) for param in self.params)
         cos = math.cos(theta / 2)
         sin = math.sin(theta / 2)
@@ -372,5 +376,5 @@ def __deepcopy__(self, memo=None):
         # assuming that `params` will be a view onto the base gate's `_params`.
         memo = memo if memo is not None else {}
         out = super().__deepcopy__(memo)
-        out._params = copy.deepcopy(out._params, memo)
+        out._params = _copy.deepcopy(out._params, memo)
         return out
diff --git a/qiskit/circuit/library/standard_gates/u1.py b/qiskit/circuit/library/standard_gates/u1.py
index b92bea51a26f..1d59cabae1f6 100644
--- a/qiskit/circuit/library/standard_gates/u1.py
+++ b/qiskit/circuit/library/standard_gates/u1.py
@@ -160,8 +160,10 @@ def inverse(self, annotated: bool = False):
         """
         return U1Gate(-self.params[0])
 
-    def __array__(self, dtype=None):
+    def __array__(self, dtype=None, copy=None):
         """Return a numpy.array for the U1 gate."""
+        if copy is False:  # the matrix is rebuilt on every call, so zero-copy is impossible
+            raise ValueError("unable to avoid copy while creating an array as requested")
         lam = float(self.params[0])
         return numpy.array([[1, 0], [0, numpy.exp(1j * lam)]], dtype=dtype)
 
@@ -304,8 +306,10 @@ def inverse(self, annotated: bool = False):
         """
         return CU1Gate(-self.params[0], ctrl_state=self.ctrl_state)
 
-    def __array__(self, dtype=None):
+    def __array__(self, dtype=None, copy=None):
         """Return a numpy.array for the CU1 gate."""
+        if copy is False:
+            raise ValueError("unable to avoid copy while creating an array as requested")
         eith = exp(1j * float(self.params[0]))
         if self.ctrl_state:
             return numpy.array(
diff --git a/qiskit/circuit/library/standard_gates/u2.py b/qiskit/circuit/library/standard_gates/u2.py
index 021a38f4daeb..c8e4de96efec 100644
--- a/qiskit/circuit/library/standard_gates/u2.py
+++ b/qiskit/circuit/library/standard_gates/u2.py
@@ -127,8 +127,10 @@ def inverse(self, annotated: bool = False):
         """
         return U2Gate(-self.params[1] - pi, -self.params[0] + pi)
 
-    def __array__(self, dtype=complex):
+    def __array__(self, dtype=None, copy=None):
         """Return a Numpy.array for the U2 gate."""
+        if copy is False:
+            raise ValueError("unable to avoid copy while creating an array as requested")
         isqrt2 = 1 / sqrt(2)
         phi, lam = self.params
         phi, lam = float(phi), float(lam)
@@ -137,5 +139,5 @@ def __array__(self, dtype=complex):
                 [isqrt2, -exp(1j * lam) * isqrt2],
                 [exp(1j * phi) * isqrt2, exp(1j * (phi + lam)) * isqrt2],
             ],
-            dtype=dtype,
+            dtype=dtype or complex,
         )
diff --git a/qiskit/circuit/library/standard_gates/u3.py b/qiskit/circuit/library/standard_gates/u3.py
index c92a48ab52b6..62c1e33b9628 100644
--- a/qiskit/circuit/library/standard_gates/u3.py
+++ b/qiskit/circuit/library/standard_gates/u3.py
@@ -149,8 +149,10 @@ def _define(self):
         qc.u(self.params[0], self.params[1], self.params[2], 0)
         self.definition = qc
 
-    def __array__(self, dtype=complex):
+    def __array__(self, dtype=None, copy=None):
         """Return a Numpy.array for the U3 gate."""
+        if copy is False:
+            raise ValueError("unable to avoid copy while creating an array as requested")
         theta, phi, lam = self.params
         theta, phi, lam = float(theta), float(phi), float(lam)
         cos = math.cos(theta / 2)
@@ -160,7 +162,7 @@ def __array__(self, dtype=complex):
                 [cos, -exp(1j * lam) * sin],
                 [exp(1j * phi) * sin, exp(1j * (phi + lam)) * cos],
             ],
-            dtype=dtype,
+            dtype=dtype or complex,
         )
 
 
@@ -305,8 +307,10 @@ def inverse(self, annotated: bool = False):
             -self.params[0], -self.params[2], -self.params[1], ctrl_state=self.ctrl_state
         )
 
-    def __array__(self, dtype=complex):
+    def __array__(self, dtype=None, copy=None):
         """Return a numpy.array for the CU3 gate."""
+        if copy is False:
+            raise ValueError("unable to avoid copy while creating an array as requested")
         theta, phi, lam = self.params
         theta, phi, lam = float(theta), float(phi), float(lam)
         cos = math.cos(theta / 2)
@@ -319,7 +323,7 @@ def __array__(self, dtype=complex):
                     [0, 0, 1, 0],
                     [0, exp(1j * phi) * sin, 0, exp(1j * (phi + lam)) * cos],
                 ],
-                dtype=dtype,
+                dtype=dtype or complex,
             )
         else:
             return numpy.array(
@@ -329,7 +333,7 @@ def __array__(self, dtype=complex):
                     [exp(1j * phi) * sin, 0, exp(1j * (phi + lam)) * cos, 0],
                     [0, 0, 0, 1],
                 ],
-                dtype=dtype,
+                dtype=dtype or complex,
             )
 
 
diff --git a/qiskit/circuit/library/standard_gates/x.py b/qiskit/circuit/library/standard_gates/x.py
index c0eb505efba0..7195df90dc98 100644
--- a/qiskit/circuit/library/standard_gates/x.py
+++ b/qiskit/circuit/library/standard_gates/x.py
@@ -107,7 +107,7 @@ def control(
             num_ctrl_qubits: number of control qubits.
             label: An optional label for the gate [Default: ``None``]
             ctrl_state: control state expressed as integer,
-                string (e.g.``'110'``), or ``None``. If ``None``, use all 1s.
+                string (e.g. ``'110'``), or ``None``. If ``None``, use all 1s.
             annotated: indicates whether the controlled gate can be implemented
                 as an annotated gate.
 
@@ -250,7 +250,7 @@ def control(
             num_ctrl_qubits: number of control qubits.
             label: An optional label for the gate [Default: ``None``]
             ctrl_state: control state expressed as integer,
-                string (e.g.``'110'``), or ``None``. If ``None``, use all 1s.
+                string (e.g. ``'110'``), or ``None``. If ``None``, use all 1s.
             annotated: indicates whether the controlled gate can be implemented
                 as an annotated gate.
 
@@ -444,7 +444,7 @@ def control(
             num_ctrl_qubits: number of control qubits.
             label: An optional label for the gate [Default: ``None``]
             ctrl_state: control state expressed as integer,
-                string (e.g.``'110'``), or ``None``. If ``None``, use all 1s.
+                string (e.g. ``'110'``), or ``None``. If ``None``, use all 1s.
             annotated: indicates whether the controlled gate can be implemented
                 as an annotated gate.
 
@@ -585,7 +585,7 @@ def __init__(
         Args:
             label: An optional label for the gate [Default: ``None``]
             ctrl_state: control state expressed as integer,
-                string (e.g.``'110'``), or ``None``. If ``None``, use all 1s.
+                string (e.g. ``'110'``), or ``None``. If ``None``, use all 1s.
         """
         from .sx import SXGate
 
@@ -785,7 +785,7 @@ def control(
             num_ctrl_qubits: number of control qubits.
             label: An optional label for the gate [Default: ``None``]
             ctrl_state: control state expressed as integer,
-                string (e.g.``'110'``), or ``None``. If ``None``, use all 1s.
+                string (e.g. ``'110'``), or ``None``. If ``None``, use all 1s.
             annotated: indicates whether the controlled gate can be implemented
                 as an annotated gate.
 
@@ -1029,7 +1029,7 @@ def control(
             num_ctrl_qubits: number of control qubits.
             label: An optional label for the gate [Default: ``None``]
             ctrl_state: control state expressed as integer,
-                string (e.g.``'110'``), or ``None``. If ``None``, use all 1s.
+                string (e.g. ``'110'``), or ``None``. If ``None``, use all 1s.
             annotated: indicates whether the controlled gate can be implemented
                 as an annotated gate.
 
@@ -1204,7 +1204,7 @@ def control(
             num_ctrl_qubits: number of control qubits.
             label: An optional label for the gate [Default: ``None``]
             ctrl_state: control state expressed as integer,
-                string (e.g.``'110'``), or ``None``. If ``None``, use all 1s.
+                string (e.g. ``'110'``), or ``None``. If ``None``, use all 1s.
             annotated: indicates whether the controlled gate can be implemented
                 as an annotated gate.
 
diff --git a/qiskit/circuit/library/standard_gates/xx_minus_yy.py b/qiskit/circuit/library/standard_gates/xx_minus_yy.py
index 387a23ad058a..4bf4ab80eca2 100644
--- a/qiskit/circuit/library/standard_gates/xx_minus_yy.py
+++ b/qiskit/circuit/library/standard_gates/xx_minus_yy.py
@@ -169,8 +169,10 @@ def inverse(self, annotated: bool = False):
         theta, beta = self.params
         return XXMinusYYGate(-theta, beta)
 
-    def __array__(self, dtype=complex):
+    def __array__(self, dtype=None, copy=None):
         """Gate matrix."""
+        if copy is False:
+            raise ValueError("unable to avoid copy while creating an array as requested")
         theta, beta = self.params
         cos = math.cos(theta / 2)
         sin = math.sin(theta / 2)
diff --git a/qiskit/circuit/library/standard_gates/xx_plus_yy.py b/qiskit/circuit/library/standard_gates/xx_plus_yy.py
index b69ba49de30d..a7b62175f207 100644
--- a/qiskit/circuit/library/standard_gates/xx_plus_yy.py
+++ b/qiskit/circuit/library/standard_gates/xx_plus_yy.py
@@ -15,6 +15,9 @@
 from cmath import exp
 from math import pi
 from typing import Optional
+
+import numpy
+
 from qiskit.circuit.gate import Gate
 from qiskit.circuit.quantumregister import QuantumRegister
 from qiskit.circuit.parameterexpression import ParameterValueType
@@ -167,10 +170,10 @@ def inverse(self, annotated: bool = False):
         """
         return XXPlusYYGate(-self.params[0], self.params[1])
 
-    def __array__(self, dtype=complex):
+    def __array__(self, dtype=None, copy=None):
         """Return a numpy.array for the XX+YY gate."""
-        import numpy
-
+        if copy is False:
+            raise ValueError("unable to avoid copy while creating an array as requested")
         half_theta = float(self.params[0]) / 2
         beta = float(self.params[1])
         cos = math.cos(half_theta)
diff --git a/qiskit/circuit/quantumcircuit.py b/qiskit/circuit/quantumcircuit.py
index f25dca4b03b4..ad966b685e71 100644
--- a/qiskit/circuit/quantumcircuit.py
+++ b/qiskit/circuit/quantumcircuit.py
@@ -883,11 +883,31 @@ def compose(
         wrap: bool = False,
         *,
         copy: bool = True,
+        var_remap: Mapping[str | expr.Var, str | expr.Var] | None = None,
+        inline_captures: bool = False,
     ) -> Optional["QuantumCircuit"]:
         """Compose circuit with ``other`` circuit or instruction, optionally permuting wires.
 
         ``other`` can be narrower or of equal width to ``self``.
 
+        When dealing with realtime variables (:class:`.expr.Var` instances), there are two principal
+        strategies for using :meth:`compose`:
+
+        1. The ``other`` circuit is treated as entirely additive, including its variables.  The
+           variables in ``other`` must be entirely distinct from those in ``self`` (use
+           ``var_remap`` to help with this), and all variables in ``other`` will be declared anew in
+           the output with matching input/capture/local scoping to how they are in ``other``.  This
+           is generally what you want if you're joining two unrelated circuits.
+
+        2. The ``other`` circuit was created as an exact extension to ``self`` to be inlined onto
+           it, including acting on the existing variables in their states at the end of ``self``.
+           In this case, ``other`` should be created with all these variables to be inlined declared
+           as "captures", and then you can use ``inline_captures=True`` in this method to link them.
+           This is generally what you want if you're building up a circuit by defining layers
+           on-the-fly, or rebuilding a circuit using layers taken from itself.  You might find the
+           ``vars_mode="captures"`` argument to :meth:`copy_empty_like` useful to create each
+           layer's base, in this case.
+
         Args:
             other (qiskit.circuit.Instruction or QuantumCircuit):
                 (sub)circuit or instruction to compose onto self.  If not a :obj:`.QuantumCircuit`,
@@ -905,6 +925,24 @@ def compose(
                 the base circuit, in order to avoid unnecessary copies; in this case, it is not
                 valid to use ``other`` afterwards, and some instructions may have been mutated in
                 place.
+            var_remap (Mapping): mapping to use to rewrite :class:`.expr.Var` nodes in ``other`` as
+                they are inlined into ``self``.  This can be used to avoid naming conflicts.
+
+                Both keys and values can be given as strings or direct :class:`.expr.Var` instances.
+                If a key is a string, it matches any :class:`~.expr.Var` with the same name.  If a
+                value is a string, whenever a new key matches it, a new :class:`~.expr.Var` is
+                created with the correct type.  If a value is a :class:`~.expr.Var`, its
+                :class:`~.expr.Expr.type` must exactly match that of the variable it is replacing.
+            inline_captures (bool): if ``True``, then all "captured" :class:`~.expr.Var` nodes in
+                the ``other`` :class:`.QuantumCircuit` are assumed to refer to variables already
+                declared in ``self`` (as any input/capture/local type), and the uses in ``other``
+                will apply to the existing variables.  If you want to build up a layer for an
+                existing circuit to use with :meth:`compose`, you might find the
+                ``vars_mode="captures"`` argument to :meth:`copy_empty_like` useful.  Any remapping
+                in ``var_remap`` occurs before evaluating this variable inlining.
+
+                If this is ``False`` (the default), then all variables in ``other`` will be required
+                to be distinct from those in ``self``, and new declarations will be made for them.
 
         Returns:
             QuantumCircuit: the composed circuit (returns None if inplace==True).
@@ -961,6 +999,31 @@ def compose(
         # error that the user might want to correct in an interactive session.
         dest = self if inplace else self.copy()
 
+        var_remap = {} if var_remap is None else var_remap
+
+        # This doesn't use `functools.cache` so we can access it during the variable remapping of
+        # instructions.  We cache all replacement lookups for a) speed and b) to ensure that
+        # the same variable _always_ maps to the same replacement even if it's used in different
+        # places in the recursion tree (such as being a captured variable).
+        def replace_var(var: expr.Var, cache: Mapping[expr.Var, expr.Var]) -> expr.Var:
+            # This is closing over an argument to `compose`.
+            nonlocal var_remap
+
+            if out := cache.get(var):
+                return out
+            if (replacement := var_remap.get(var)) or (replacement := var_remap.get(var.name)):
+                if isinstance(replacement, str):
+                    replacement = expr.Var.new(replacement, var.type)
+                if replacement.type != var.type:
+                    raise CircuitError(
+                        f"mismatched types in replacement for '{var.name}':"
+                        f" '{var.type}' cannot become '{replacement.type}'"
+                    )
+            else:
+                replacement = var
+            cache[var] = replacement
+            return replacement
+
         # As a special case, allow composing some clbits onto no clbits - normally the destination
         # has to be strictly larger. This allows composing final measurements onto unitary circuits.
         if isinstance(other, QuantumCircuit):
@@ -1044,38 +1107,100 @@ def compose(
         dest.unit = "dt"
         dest.global_phase += other.global_phase
 
-        if not other.data:
-            # Nothing left to do. Plus, accessing 'data' here is necessary
-            # to trigger any lazy building since we now access '_data'
-            # directly.
-            return None if inplace else dest
+        # This is required to trigger data builds if the `other` is an unbuilt `BlueprintCircuit`,
+        # so we can then access the complete `CircuitData` object at `_data`.
+        _ = other.data
 
-        variable_mapper = _classical_resource_map.VariableMapper(
-            dest.cregs, edge_map, dest.add_register
-        )
+        def copy_with_remapping(
+            source, dest, bit_map, var_map, inline_captures, new_qubits=None, new_clbits=None
+        ):
+            # Copy the instructions from `source` into `dest`, remapping variables in instructions
+            # according to `var_map`.  If `new_qubits` or `new_clbits` are given, the qubits and
+            # clbits of the source instruction are remapped to those as well.
+            for var in source.iter_input_vars():
+                dest.add_input(replace_var(var, var_map))
+            if inline_captures:
+                for var in source.iter_captured_vars():
+                    replacement = replace_var(var, var_map)
+                    if not dest.has_var(replace_var(var, var_map)):
+                        if var is replacement:
+                            raise CircuitError(
+                                f"Variable '{var}' to be inlined is not in the base circuit."
+                                " If you wanted it to be automatically added, use"
+                                " `inline_captures=False`."
+                            )
+                        raise CircuitError(
+                            f"Replacement '{replacement}' for variable '{var}' is not in the"
+                            " base circuit.  Is the replacement correct?"
+                        )
+            else:
+                for var in source.iter_captured_vars():
+                    dest.add_capture(replace_var(var, var_map))
+            for var in source.iter_declared_vars():
+                dest.add_uninitialized_var(replace_var(var, var_map))
+
+            def recurse_block(block):
+                # Recurse the remapping into a control-flow block.  Note that this doesn't remap the
+                # clbits within; the story around nested classical-register-based control-flow
+                # doesn't really work in the current data model, and we hope to replace it with
+                # `Expr`-based control-flow everywhere.
+                new_block = block.copy_empty_like()
+                new_block._vars_input = {}
+                new_block._vars_capture = {}
+                new_block._vars_local = {}
+                # For the recursion, we never want to inline captured variables because we're not
+                # copying onto a base that has variables.
+                copy_with_remapping(block, new_block, bit_map, var_map, inline_captures=False)
+                return new_block
+
+            variable_mapper = _classical_resource_map.VariableMapper(
+                dest.cregs, bit_map, var_map, add_register=dest.add_register
+            )
 
-        def map_vars(op):
-            n_op = op.copy() if copy else op
-            if (condition := getattr(n_op, "condition", None)) is not None:
-                n_op.condition = variable_mapper.map_condition(condition)
-            if isinstance(n_op, SwitchCaseOp):
-                n_op = n_op.copy() if n_op is op else n_op
-                n_op.target = variable_mapper.map_target(n_op.target)
-            return n_op
+            def map_vars(op):
+                n_op = op
+                is_control_flow = isinstance(n_op, ControlFlowOp)
+                if (
+                    not is_control_flow
+                    and (condition := getattr(n_op, "condition", None)) is not None
+                ):
+                    n_op = n_op.copy() if n_op is op and copy else n_op
+                    n_op.condition = variable_mapper.map_condition(condition)
+                elif is_control_flow:
+                    n_op = n_op.replace_blocks(recurse_block(block) for block in n_op.blocks)
+                    if isinstance(n_op, (IfElseOp, WhileLoopOp)):
+                        n_op.condition = variable_mapper.map_condition(n_op.condition)
+                    elif isinstance(n_op, SwitchCaseOp):
+                        n_op.target = variable_mapper.map_target(n_op.target)
+                elif isinstance(n_op, Store):
+                    n_op = Store(
+                        variable_mapper.map_expr(n_op.lvalue), variable_mapper.map_expr(n_op.rvalue)
+                    )
+                return n_op.copy() if n_op is op and copy else n_op
 
-        mapped_instrs: CircuitData = other._data.copy()
-        mapped_instrs.replace_bits(qubits=mapped_qubits, clbits=mapped_clbits)
-        mapped_instrs.map_ops(map_vars)
+            instructions = source._data.copy()
+            instructions.replace_bits(qubits=new_qubits, clbits=new_clbits)
+            instructions.map_ops(map_vars)
+            dest._current_scope().extend(instructions)
 
         append_existing = None
         if front:
             append_existing = dest._data.copy()
             dest.clear()
-
-        circuit_scope = dest._current_scope()
-        circuit_scope.extend(mapped_instrs)
+        copy_with_remapping(
+            other,
+            dest,
+            bit_map=edge_map,
+            # The actual `Var: Var` map gets built up from the more freeform user input as we
+            # encounter the variables, since the user might be using string keys to refer to more
+            # than one variable in separated scopes of control-flow operations.
+            var_map={},
+            inline_captures=inline_captures,
+            new_qubits=mapped_qubits,
+            new_clbits=mapped_clbits,
+        )
         if append_existing:
-            circuit_scope.extend(append_existing)
+            dest._current_scope().extend(append_existing)
 
         return None if inplace else dest
 
@@ -1765,7 +1890,18 @@ def add_var(self, name_or_var: str | expr.Var, /, initial: typing.Any) -> expr.V
         # Validate the initialiser first to catch cases where the variable to be declared is being
         # used in the initialiser.
         circuit_scope = self._current_scope()
-        initial = _validate_expr(circuit_scope, expr.lift(initial))
+        # Convenience method to widen Python integer literals to the right width during the initial
+        # lift, if the type is already known via the variable.
+        if (
+            isinstance(name_or_var, expr.Var)
+            and name_or_var.type.kind is types.Uint
+            and isinstance(initial, int)
+            and not isinstance(initial, bool)
+        ):
+            coerce_type = name_or_var.type
+        else:
+            coerce_type = None
+        initial = _validate_expr(circuit_scope, expr.lift(initial, coerce_type))
         if isinstance(name_or_var, str):
             var = expr.Var.new(name_or_var, initial.type)
         elif not name_or_var.standalone:
@@ -2509,7 +2645,7 @@ def num_tensor_factors(self) -> int:
         """
         return self.num_unitary_factors()
 
-    def copy(self, name: str | None = None) -> "QuantumCircuit":
+    def copy(self, name: str | None = None) -> typing.Self:
         """Copy the circuit.
 
         Args:
@@ -2545,24 +2681,47 @@ def memo_copy(op):
         )
         return cpy
 
-    def copy_empty_like(self, name: str | None = None) -> "QuantumCircuit":
+    def copy_empty_like(
+        self,
+        name: str | None = None,
+        *,
+        vars_mode: Literal["alike", "captures", "drop"] = "alike",
+    ) -> typing.Self:
         """Return a copy of self with the same structure but empty.
 
         That structure includes:
-            * name, calibrations and other metadata
-            * global phase
-            * all the qubits and clbits, including the registers
+
+        * name, calibrations and other metadata
+        * global phase
+        * all the qubits and clbits, including the registers
+        * the realtime variables defined in the circuit, handled according to the ``vars_mode``
+          keyword argument.
 
         .. warning::
 
             If the circuit contains any local variable declarations (those added by the
             ``declarations`` argument to the circuit constructor, or using :meth:`add_var`), they
-            will be **uninitialized** in the output circuit.  You will need to manually add store
+            may be **uninitialized** in the output circuit.  You will need to manually add store
             instructions for them (see :class:`.Store` and :meth:`.QuantumCircuit.store`) to
             initialize them.
 
         Args:
-            name (str): Name for the copied circuit. If None, then the name stays the same.
+            name: Name for the copied circuit. If None, then the name stays the same.
+            vars_mode: The mode to handle realtime variables in.
+
+                alike
+                    The variables in the output circuit will have the same declaration semantics as
+                    in the original circuit.  For example, ``input`` variables in the source will be
+                    ``input`` variables in the output circuit.
+
+                captures
+                    All variables will be converted to captured variables.  This is useful when you
+                    are building a new layer for an existing circuit that you will want to
+                    :meth:`compose` onto the base, since :meth:`compose` can inline captures onto
+                    the base circuit (but not other variables).
+
+                drop
+                    The output circuit will have no variables defined.
 
         Returns:
             QuantumCircuit: An empty copy of self.
@@ -2580,12 +2739,23 @@ def copy_empty_like(self, name: str | None = None) -> "QuantumCircuit":
         cpy._qubit_indices = self._qubit_indices.copy()
         cpy._clbit_indices = self._clbit_indices.copy()
 
-        # Note that this causes the local variables to be uninitialised, because the stores are not
-        # copied.  This can leave the circuit in a potentially dangerous state for users if they
-        # don't re-add initialiser stores.
-        cpy._vars_local = self._vars_local.copy()
-        cpy._vars_input = self._vars_input.copy()
-        cpy._vars_capture = self._vars_capture.copy()
+        if vars_mode == "alike":
+            # Note that this causes the local variables to be uninitialised, because the stores are
+            # not copied.  This can leave the circuit in a potentially dangerous state for users if
+            # they don't re-add initialiser stores.
+            cpy._vars_local = self._vars_local.copy()
+            cpy._vars_input = self._vars_input.copy()
+            cpy._vars_capture = self._vars_capture.copy()
+        elif vars_mode == "captures":
+            cpy._vars_local = {}
+            cpy._vars_input = {}
+            cpy._vars_capture = {var.name: var for var in self.iter_vars()}
+        elif vars_mode == "drop":
+            cpy._vars_local = {}
+            cpy._vars_input = {}
+            cpy._vars_capture = {}
+        else:  # pragma: no cover
+            raise ValueError(f"unknown vars_mode: '{vars_mode}'")
 
         cpy._parameter_table = ParameterTable()
         for parameter in getattr(cpy.global_phase, "parameters", ()):
@@ -2669,7 +2839,13 @@ def store(self, lvalue: typing.Any, rvalue: typing.Any, /) -> InstructionSet:
             :meth:`add_var`
                 Create a new variable in the circuit that can be written to with this method.
         """
-        return self.append(Store(expr.lift(lvalue), expr.lift(rvalue)), (), (), copy=False)
+        # As a convenience, lift integer-literal rvalues to the matching width.
+        lvalue = expr.lift(lvalue)
+        rvalue_type = (
+            lvalue.type if isinstance(rvalue, int) and not isinstance(rvalue, bool) else None
+        )
+        rvalue = expr.lift(rvalue, rvalue_type)
+        return self.append(Store(lvalue, rvalue), (), (), copy=False)
 
     def measure(self, qubit: QubitSpecifier, cbit: ClbitSpecifier) -> InstructionSet:
         r"""Measure a quantum bit (``qubit``) in the Z basis into a classical bit (``cbit``).
diff --git a/qiskit/compiler/assembler.py b/qiskit/compiler/assembler.py
index c7d5885506fd..a6c5212e2330 100644
--- a/qiskit/compiler/assembler.py
+++ b/qiskit/compiler/assembler.py
@@ -350,20 +350,20 @@ def _parse_common_args(
     ]
 
     # create run configuration and populate
-    run_config_dict = dict(
-        shots=shots,
-        memory=memory,
-        seed_simulator=seed_simulator,
-        init_qubits=init_qubits,
-        rep_delay=rep_delay,
-        qubit_lo_freq=qubit_lo_freq,
-        meas_lo_freq=meas_lo_freq,
-        qubit_lo_range=qubit_lo_range,
-        meas_lo_range=meas_lo_range,
-        schedule_los=schedule_los,
-        n_qubits=n_qubits,
+    run_config_dict = {
+        "shots": shots,
+        "memory": memory,
+        "seed_simulator": seed_simulator,
+        "init_qubits": init_qubits,
+        "rep_delay": rep_delay,
+        "qubit_lo_freq": qubit_lo_freq,
+        "meas_lo_freq": meas_lo_freq,
+        "qubit_lo_range": qubit_lo_range,
+        "meas_lo_range": meas_lo_range,
+        "schedule_los": schedule_los,
+        "n_qubits": n_qubits,
         **run_config,
-    )
+    }
 
     return qobj_id, qobj_header, run_config_dict
 
@@ -452,15 +452,15 @@ def _parse_pulse_args(
         parametric_pulses = getattr(backend_config, "parametric_pulses", [])
 
     # create run configuration and populate
-    run_config_dict = dict(
-        meas_level=meas_level,
-        meas_return=meas_return,
-        meas_map=meas_map,
-        memory_slot_size=memory_slot_size,
-        rep_time=rep_time,
-        parametric_pulses=parametric_pulses,
+    run_config_dict = {
+        "meas_level": meas_level,
+        "meas_return": meas_return,
+        "meas_map": meas_map,
+        "memory_slot_size": memory_slot_size,
+        "rep_time": rep_time,
+        "parametric_pulses": parametric_pulses,
         **run_config,
-    )
+    }
     run_config = RunConfig(**{k: v for k, v in run_config_dict.items() if v is not None})
 
     return run_config
@@ -478,7 +478,7 @@ def _parse_circuit_args(
     """
     parameter_binds = parameter_binds or []
     # create run configuration and populate
-    run_config_dict = dict(parameter_binds=parameter_binds, **run_config)
+    run_config_dict = {"parameter_binds": parameter_binds, **run_config}
     if parametric_pulses is None:
         if backend:
             run_config_dict["parametric_pulses"] = getattr(
diff --git a/qiskit/compiler/transpiler.py b/qiskit/compiler/transpiler.py
index 93a04b18bad7..95c583ceeaad 100644
--- a/qiskit/compiler/transpiler.py
+++ b/qiskit/compiler/transpiler.py
@@ -24,6 +24,7 @@
 from qiskit.circuit.quantumregister import Qubit
 from qiskit.dagcircuit import DAGCircuit
 from qiskit.providers.backend import Backend
+from qiskit.providers.backend_compat import BackendV2Converter
 from qiskit.providers.models import BackendProperties
 from qiskit.pulse import Schedule, InstructionScheduleMap
 from qiskit.transpiler import Layout, CouplingMap, PropertySet
@@ -316,6 +317,12 @@ def callback_func(**kwargs):
         config = user_config.get_config()
         optimization_level = config.get("transpile_optimization_level", 1)
 
+    if backend is not None and getattr(backend, "version", 0) <= 1:
+        # This is a temporary conversion step to allow for a smoother transition
+        # to a fully target-based transpiler pipeline while maintaining the behavior
+        # of `transpile` with BackendV1 inputs.
+        backend = BackendV2Converter(backend)
+
     if (
         scheduling_method is not None
         and backend is None
@@ -330,7 +337,7 @@ def callback_func(**kwargs):
 
     _skip_target = False
     _given_inst_map = bool(inst_map)  # check before inst_map is overwritten
-    # If a target is specified have it override any implicit selections from a backend
+    # If a target is specified, have it override any implicit selections from a backend
     if target is not None:
         if coupling_map is None:
             coupling_map = target.build_coupling_map()
@@ -347,7 +354,7 @@ def callback_func(**kwargs):
         if backend_properties is None:
             backend_properties = target_to_backend_properties(target)
     # If target is not specified and any hardware constraint object is
-    # manually specified then do not use the target from the backend as
+    # manually specified, do not use the target from the backend as
     # it is invalidated by a custom basis gate list, custom coupling map,
     # custom dt or custom instruction_durations
     elif (
@@ -372,6 +379,7 @@ def callback_func(**kwargs):
     _check_circuits_coupling_map(circuits, coupling_map, backend)
 
     timing_constraints = _parse_timing_constraints(backend, timing_constraints)
+    instruction_durations = _parse_instruction_durations(backend, instruction_durations, dt)
 
     if _given_inst_map and inst_map.has_custom_gate() and target is not None:
         # Do not mutate backend target
@@ -384,51 +392,6 @@ def callback_func(**kwargs):
         if translation_method is None and hasattr(backend, "get_translation_stage_plugin"):
             translation_method = backend.get_translation_stage_plugin()
 
-    if instruction_durations or dt:
-        # If durations are provided and there is more than one circuit
-        # we need to serialize the execution because the full durations
-        # is dependent on the circuit calibrations which are per circuit
-        if len(circuits) > 1:
-            out_circuits = []
-            for circuit in circuits:
-                instruction_durations = _parse_instruction_durations(
-                    backend, instruction_durations, dt, circuit
-                )
-                pm = generate_preset_pass_manager(
-                    optimization_level,
-                    backend=backend,
-                    target=target,
-                    basis_gates=basis_gates,
-                    inst_map=inst_map,
-                    coupling_map=coupling_map,
-                    instruction_durations=instruction_durations,
-                    backend_properties=backend_properties,
-                    timing_constraints=timing_constraints,
-                    initial_layout=initial_layout,
-                    layout_method=layout_method,
-                    routing_method=routing_method,
-                    translation_method=translation_method,
-                    scheduling_method=scheduling_method,
-                    approximation_degree=approximation_degree,
-                    seed_transpiler=seed_transpiler,
-                    unitary_synthesis_method=unitary_synthesis_method,
-                    unitary_synthesis_plugin_config=unitary_synthesis_plugin_config,
-                    hls_config=hls_config,
-                    init_method=init_method,
-                    optimization_method=optimization_method,
-                    _skip_target=_skip_target,
-                )
-                out_circuits.append(pm.run(circuit, callback=callback, num_processes=num_processes))
-            for name, circ in zip(output_name, out_circuits):
-                circ.name = name
-                end_time = time()
-            _log_transpile_time(start_time, end_time)
-            return out_circuits
-        else:
-            instruction_durations = _parse_instruction_durations(
-                backend, instruction_durations, dt, circuits[0]
-            )
-
     pm = generate_preset_pass_manager(
         optimization_level,
         backend=backend,
@@ -453,7 +416,7 @@ def callback_func(**kwargs):
         optimization_method=optimization_method,
         _skip_target=_skip_target,
     )
-    out_circuits = pm.run(circuits, callback=callback)
+    out_circuits = pm.run(circuits, callback=callback, num_processes=num_processes)
     for name, circ in zip(output_name, out_circuits):
         circ.name = name
     end_time = time()
@@ -471,14 +434,8 @@ def _check_circuits_coupling_map(circuits, cmap, backend):
     if cmap is not None:
         max_qubits = cmap.size()
     elif backend is not None:
-        backend_version = getattr(backend, "version", 0)
-        if backend_version <= 1:
-            if not backend.configuration().simulator:
-                max_qubits = backend.configuration().n_qubits
-            else:
-                max_qubits = None
-        else:
-            max_qubits = backend.num_qubits
+        max_qubits = backend.num_qubits
+
     for circuit in circuits:
         # If coupling_map is not None or num_qubits == 1
         num_qubits = len(circuit.qubits)
@@ -496,27 +453,15 @@ def _log_transpile_time(start_time, end_time):
 
 def _parse_inst_map(inst_map, backend):
     # try getting inst_map from user, else backend
-    if inst_map is None:
-        backend_version = getattr(backend, "version", 0)
-        if backend_version <= 1:
-            if hasattr(backend, "defaults"):
-                inst_map = getattr(backend.defaults(), "instruction_schedule_map", None)
-        else:
-            inst_map = backend.target.instruction_schedule_map()
+    if inst_map is None and backend is not None:
+        inst_map = backend.target.instruction_schedule_map()
     return inst_map
 
 
 def _parse_coupling_map(coupling_map, backend):
     # try getting coupling_map from user, else backend
-    if coupling_map is None:
-        backend_version = getattr(backend, "version", 0)
-        if backend_version <= 1:
-            if getattr(backend, "configuration", None):
-                configuration = backend.configuration()
-                if hasattr(configuration, "coupling_map") and configuration.coupling_map:
-                    coupling_map = CouplingMap(configuration.coupling_map)
-        else:
-            coupling_map = backend.coupling_map
+    if coupling_map is None and backend is not None:
+        coupling_map = backend.coupling_map
 
     # coupling_map could be None, or a list of lists, e.g. [[0, 1], [2, 1]]
     if coupling_map is None or isinstance(coupling_map, CouplingMap):
@@ -546,38 +491,20 @@ def _parse_initial_layout(initial_layout):
     return initial_layout
 
 
-def _parse_instruction_durations(backend, inst_durations, dt, circuit):
+def _parse_instruction_durations(backend, inst_durations, dt):
     """Create a list of ``InstructionDuration``s. If ``inst_durations`` is provided,
     the backend will be ignored, otherwise, the durations will be populated from the
-    backend. If any circuits have gate calibrations, those calibration durations would
-    take precedence over backend durations, but be superceded by ``inst_duration``s.
+    backend.
     """
+    final_durations = InstructionDurations()
     if not inst_durations:
-        backend_version = getattr(backend, "version", 0)
-        if backend_version <= 1:
-            backend_durations = InstructionDurations()
-            try:
-                backend_durations = InstructionDurations.from_backend(backend)
-            except AttributeError:
-                pass
-        else:
+        backend_durations = InstructionDurations()
+        if backend is not None:
             backend_durations = backend.instruction_durations
-
-    circ_durations = InstructionDurations()
-    if not inst_durations:
-        circ_durations.update(backend_durations, dt or backend_durations.dt)
-
-    if circuit.calibrations:
-        cal_durations = []
-        for gate, gate_cals in circuit.calibrations.items():
-            for (qubits, parameters), schedule in gate_cals.items():
-                cal_durations.append((gate, qubits, parameters, schedule.duration))
-        circ_durations.update(cal_durations, circ_durations.dt)
-
-    if inst_durations:
-        circ_durations.update(inst_durations, dt or getattr(inst_durations, "dt", None))
-
-    return circ_durations
+        final_durations.update(backend_durations, dt or backend_durations.dt)
+    else:
+        final_durations.update(inst_durations, dt or getattr(inst_durations, "dt", None))
+    return final_durations
 
 
 def _parse_approximation_degree(approximation_degree):
@@ -629,13 +556,6 @@ def _parse_timing_constraints(backend, timing_constraints):
         return timing_constraints
     if backend is None and timing_constraints is None:
         timing_constraints = TimingConstraints()
-    else:
-        backend_version = getattr(backend, "version", 0)
-        if backend_version <= 1:
-            if timing_constraints is None:
-                # get constraints from backend
-                timing_constraints = getattr(backend.configuration(), "timing_constraints", {})
-            timing_constraints = TimingConstraints(**timing_constraints)
-        else:
-            timing_constraints = backend.target.timing_constraints()
+    elif backend is not None:
+        timing_constraints = backend.target.timing_constraints()
     return timing_constraints
diff --git a/qiskit/converters/circuit_to_dag.py b/qiskit/converters/circuit_to_dag.py
index e2612b43d3e6..b2c1df2a037b 100644
--- a/qiskit/converters/circuit_to_dag.py
+++ b/qiskit/converters/circuit_to_dag.py
@@ -79,6 +79,13 @@ def circuit_to_dag(circuit, copy_operations=True, *, qubit_order=None, clbit_ord
     dagcircuit.add_qubits(qubits)
     dagcircuit.add_clbits(clbits)
 
+    for var in circuit.iter_input_vars():
+        dagcircuit.add_input_var(var)
+    for var in circuit.iter_captured_vars():
+        dagcircuit.add_captured_var(var)
+    for var in circuit.iter_declared_vars():
+        dagcircuit.add_declared_var(var)
+
     for register in circuit.qregs:
         dagcircuit.add_qreg(register)
 
diff --git a/qiskit/converters/circuit_to_gate.py b/qiskit/converters/circuit_to_gate.py
index 283dd87dbd71..39eed1053eb1 100644
--- a/qiskit/converters/circuit_to_gate.py
+++ b/qiskit/converters/circuit_to_gate.py
@@ -58,6 +58,8 @@ def circuit_to_gate(circuit, parameter_map=None, equivalence_library=None, label
 
     if circuit.clbits:
         raise QiskitError("Circuit with classical bits cannot be converted to gate.")
+    if circuit.num_vars:
+        raise QiskitError("circuits with realtime classical variables cannot be converted to gates")
 
     for instruction in circuit.data:
         if not _check_is_gate(instruction.operation):
diff --git a/qiskit/converters/circuit_to_instruction.py b/qiskit/converters/circuit_to_instruction.py
index e4bba13b0334..2bdcbfef3583 100644
--- a/qiskit/converters/circuit_to_instruction.py
+++ b/qiskit/converters/circuit_to_instruction.py
@@ -61,6 +61,28 @@ def circuit_to_instruction(circuit, parameter_map=None, equivalence_library=None
     # pylint: disable=cyclic-import
     from qiskit.circuit.quantumcircuit import QuantumCircuit
 
+    if circuit.num_input_vars:
+        # This could be supported by moving the `input` variables to be parameters of the
+        # instruction, but we don't really have a good representation of that yet, so safer to
+        # forbid it.
+        raise QiskitError("Circuits with 'input' variables cannot yet be converted to instructions")
+    if circuit.num_captured_vars:
+        raise QiskitError("Circuits that capture variables cannot be converted to instructions")
+    if circuit.num_declared_vars:
+        # This could very easily be supported in representations, since the variables are allocated
+        # and freed within the instruction itself.  The reason to initially forbid it is to avoid
+        # needing to support unrolling such instructions within the transpiler; we would potentially
+        # need to remap variables to unique names in the larger context, and we don't yet have a way
+        # to return that information from the transpiler.  We have to catch that in the transpiler
+        # as well since a user could manually make an instruction with such a definition, but
+        # forbidding it here means users get a more meaningful error at the point that the
+        # instruction actually gets created (since users often aren't aware that
+        # `QuantumCircuit.append(QuantumCircuit)` implicitly converts to an instruction).
+        raise QiskitError(
+            "Circuits with internal variables cannot yet be converted to instructions."
+            " You may be able to use `QuantumCircuit.compose` to inline this circuit into another."
+        )
+
     if parameter_map is None:
         parameter_dict = {p: p for p in circuit.parameters}
     else:
diff --git a/qiskit/converters/dag_to_circuit.py b/qiskit/converters/dag_to_circuit.py
index 5a32f0bba1e1..ede026c247c9 100644
--- a/qiskit/converters/dag_to_circuit.py
+++ b/qiskit/converters/dag_to_circuit.py
@@ -62,7 +62,11 @@ def dag_to_circuit(dag, copy_operations=True):
         *dag.cregs.values(),
         name=name,
         global_phase=dag.global_phase,
+        inputs=dag.iter_input_vars(),
+        captures=dag.iter_captured_vars(),
     )
+    for var in dag.iter_declared_vars():
+        circuit.add_uninitialized_var(var)
     circuit.metadata = dag.metadata
     circuit.calibrations = dag.calibrations
 
diff --git a/qiskit/dagcircuit/dagcircuit.py b/qiskit/dagcircuit/dagcircuit.py
index 8c1332a8e604..838e9cfe0f86 100644
--- a/qiskit/dagcircuit/dagcircuit.py
+++ b/qiskit/dagcircuit/dagcircuit.py
@@ -22,11 +22,13 @@
 """
 from __future__ import annotations
 
-from collections import OrderedDict, defaultdict, deque, namedtuple
-from collections.abc import Callable, Sequence, Generator, Iterable
 import copy
+import enum
+import itertools
 import math
-from typing import Any
+from collections import OrderedDict, defaultdict, deque, namedtuple
+from collections.abc import Callable, Sequence, Generator, Iterable
+from typing import Any, Literal
 
 import numpy as np
 import rustworkx as rx
@@ -39,7 +41,9 @@
     SwitchCaseOp,
     _classical_resource_map,
     Operation,
+    Store,
 )
+from qiskit.circuit.classical import expr
 from qiskit.circuit.controlflow import condition_resources, node_resources, CONTROL_FLOW_OP_NAMES
 from qiskit.circuit.quantumregister import QuantumRegister, Qubit
 from qiskit.circuit.classicalregister import ClassicalRegister, Clbit
@@ -52,6 +56,8 @@
 from qiskit.pulse import Schedule
 
 BitLocations = namedtuple("BitLocations", ("index", "registers"))
+# The allowable arguments to :meth:`DAGCircuit.copy_empty_like`'s ``vars_mode``.
+_VarsMode = Literal["alike", "captures", "drop"]
 
 
 class DAGCircuit:
@@ -78,13 +84,24 @@ def __init__(self):
         # Cache of dag op node sort keys
         self._key_cache = {}
 
-        # Set of wires (Register,idx) in the dag
+        # Set of wire data in the DAG.  A wire is an owned unit of data.  Qubits are the primary
+        # wire type (and the only data that has _true_ wire properties from a read/write
+        # perspective), but clbits and classical `Var`s are too.  Note: classical registers are
+        # _not_ wires because the individual bits are the more fundamental unit.  We treat `Var`s
+        # as the entire wire (as opposed to individual bits of them) for scalability reasons; if a
+        # parametric program wants to parametrize over 16-bit angles, we can't scale to 1000s of
+        # those by tracking all 16 bits individually.
+        #
+        # Classical variables shouldn't be "wires"; it should be possible to have multiple reads
+        # without implying ordering.  The initial addition of the classical variables uses the
+        # existing wire structure as an MVP; we expect to handle this better in a new version of the
+        # transpiler IR that also handles control flow more properly.
         self._wires = set()
 
-        # Map from wire (Register,idx) to input nodes of the graph
+        # Map from wire to input nodes of the graph
         self.input_map = OrderedDict()
 
-        # Map from wire (Register,idx) to output nodes of the graph
+        # Map from wire to output nodes of the graph
         self.output_map = OrderedDict()
 
         # Directed multigraph whose nodes are inputs, outputs, or operations.
@@ -92,7 +109,7 @@ def __init__(self):
         # additional data about the operation, including the argument order
         # and parameter values.
         # Input nodes have out-degree 1 and output nodes have in-degree 1.
-        # Edges carry wire labels (reg,idx) and each operation has
+        # Edges carry wire labels and each operation has
         # corresponding in- and out-edges with the same wire labels.
         self._multi_graph = rx.PyDAG()
 
@@ -110,6 +127,16 @@ def __init__(self):
         # its index within that register.
         self._qubit_indices: dict[Qubit, BitLocations] = {}
         self._clbit_indices: dict[Clbit, BitLocations] = {}
+        # Tracking for the classical variables used in the circuit.  This contains the information
+        # needed to insert new nodes.  This is keyed by the name rather than the `Var` instance
+        # itself so we can ensure we don't allow shadowing or redefinition of names.
+        self._vars_info: dict[str, _DAGVarInfo] = {}
+        # Convenience stateful tracking for the individual types of nodes to allow things like
+        # comparisons between circuits to take place without needing to disambiguate the
+        # graph-specific usage information.
+        self._vars_by_type: dict[_DAGVarType, set[expr.Var]] = {
+            type_: set() for type_ in _DAGVarType
+        }
 
         self._global_phase: float | ParameterExpression = 0.0
         self._calibrations: dict[str, dict[tuple, Schedule]] = defaultdict(dict)
@@ -122,7 +149,11 @@ def __init__(self):
     @property
     def wires(self):
         """Return a list of the wires in order."""
-        return self.qubits + self.clbits
+        return (
+            self.qubits
+            + self.clbits
+            + [var for vars in self._vars_by_type.values() for var in vars]
+        )
 
     @property
     def node_counter(self):
@@ -297,6 +328,57 @@ def add_creg(self, creg):
                 )
                 self._add_wire(creg[j])
 
+    def add_input_var(self, var: expr.Var):
+        """Add an input variable to the circuit, which must not already contain captured variables.
+
+        Args:
+            var: the variable to add."""
+        if self._vars_by_type[_DAGVarType.CAPTURE]:
+            raise DAGCircuitError("cannot add inputs to a circuit with captures")
+        self._add_var(var, _DAGVarType.INPUT)
+
+    def add_captured_var(self, var: expr.Var):
+        """Add a captured variable to the circuit, which must not already contain input variables.
+
+        Args:
+            var: the variable to add."""
+        if self._vars_by_type[_DAGVarType.INPUT]:
+            raise DAGCircuitError("cannot add captures to a circuit with inputs")
+        self._add_var(var, _DAGVarType.CAPTURE)
+
+    def add_declared_var(self, var: expr.Var):
+        """Add a declared local variable to the circuit.
+
+        Args:
+            var: the variable to add."""
+        self._add_var(var, _DAGVarType.DECLARE)  # no input/capture exclusivity check here
+
+    def _add_var(self, var: expr.Var, type_: _DAGVarType):
+        """Inner function to add any variable to the DAG.  ``type_`` is the :class:`_DAGVarType`
+        that selects which ``self._vars_by_type`` tracking set the variable is recorded in.
+        """
+        # The setup of the initial graph structure between an "in" and an "out" node is the same as
+        # the bit-related `_add_wire`, but this logically needs to do different bookkeeping around
+        # tracking the properties.
+        if not var.standalone:
+            raise DAGCircuitError(
+                "cannot add variables that wrap `Clbit` or `ClassicalRegister` instances"
+            )
+        if (previous := self._vars_info.get(var.name, None)) is not None:
+            if previous.var == var:
+                raise DAGCircuitError(f"'{var}' is already present in the circuit")
+            raise DAGCircuitError(
+                f"cannot add '{var}' as its name shadows the existing '{previous.var}'"
+            )
+        in_node = DAGInNode(wire=var)
+        out_node = DAGOutNode(wire=var)
+        in_node._node_id, out_node._node_id = self._multi_graph.add_nodes_from((in_node, out_node))
+        self._multi_graph.add_edge(in_node._node_id, out_node._node_id, var)
+        self.input_map[var] = in_node
+        self.output_map[var] = out_node
+        self._vars_by_type[type_].add(var)
+        self._vars_info[var.name] = _DAGVarInfo(var, type_, in_node, out_node)
+
     def _add_wire(self, wire):
         """Add a qubit or bit to the circuit.
 
@@ -543,14 +625,14 @@ def _check_condition(self, name, condition):
         if not set(resources.clbits).issubset(self.clbits):
             raise DAGCircuitError(f"invalid clbits in condition for {name}")
 
-    def _check_bits(self, args, amap):
-        """Check the values of a list of (qu)bit arguments.
+    def _check_wires(self, args: Iterable[Bit | expr.Var], amap: dict[Bit | expr.Var, Any]):
+        """Check the values of a list of wire arguments.
 
         For each element of args, check that amap contains it.
 
         Args:
-            args (list[Bit]): the elements to be checked
-            amap (dict): a dictionary keyed on Qubits/Clbits
+            args: the elements to be checked
+            amap: a dictionary keyed on wires (Qubits, Clbits or Vars)
 
         Raises:
             DAGCircuitError: if a qubit is not contained in amap
@@ -558,46 +640,7 @@ def _check_bits(self, args, amap):
         # Check for each wire
         for wire in args:
             if wire not in amap:
-                raise DAGCircuitError(f"(qu)bit {wire} not found in {amap}")
-
-    @staticmethod
-    def _bits_in_operation(operation):
-        """Return an iterable over the classical bits that are inherent to an instruction.  This
-        includes a `condition`, or the `target` of a :class:`.ControlFlowOp`.
-
-        Args:
-            instruction: the :class:`~.circuit.Instruction` instance for a node.
-
-        Returns:
-            Iterable[Clbit]: the :class:`.Clbit`\\ s involved.
-        """
-        # If updating this, also update the fast-path checker `DAGCirucit._operation_may_have_bits`.
-        if (condition := getattr(operation, "condition", None)) is not None:
-            yield from condition_resources(condition).clbits
-        if isinstance(operation, SwitchCaseOp):
-            target = operation.target
-            if isinstance(target, Clbit):
-                yield target
-            elif isinstance(target, ClassicalRegister):
-                yield from target
-            else:
-                yield from node_resources(target).clbits
-
-    @staticmethod
-    def _operation_may_have_bits(operation) -> bool:
-        """Return whether a given :class:`.Operation` may contain any :class:`.Clbit` instances
-        in itself (e.g. a control-flow operation).
-
-        Args:
-            operation (qiskit.circuit.Operation): the operation to check.
-        """
-        # This is separate to `_bits_in_operation` because most of the time there won't be any bits,
-        # so we want a fast path to be able to skip creating and testing a generator for emptiness.
-        #
-        # If updating this, also update `DAGCirucit._bits_in_operation`.
-        return getattr(operation, "condition", None) is not None or isinstance(
-            operation, SwitchCaseOp
-        )
+                raise DAGCircuitError(f"wire {wire} not found in {amap}")
 
     def _increment_op(self, op):
         if op.name in self._op_names:
@@ -611,14 +654,32 @@ def _decrement_op(self, op):
         else:
             self._op_names[op.name] -= 1
 
-    def copy_empty_like(self):
+    def copy_empty_like(self, *, vars_mode: _VarsMode = "alike"):
         """Return a copy of self with the same structure but empty.
 
         That structure includes:
             * name and other metadata
             * global phase
             * duration
-            * all the qubits and clbits, including the registers.
+            * all the qubits and clbits, including the registers
+            * all the classical variables, with a mode defined by ``vars_mode``.
+
+        Args:
+            vars_mode: The mode to handle realtime variables in.
+
+                alike
+                    The variables in the output DAG will have the same declaration semantics as
+                    in the original circuit.  For example, ``input`` variables in the source will be
+                    ``input`` variables in the output DAG.
+
+                captures
+                    All variables will be converted to captured variables.  This is useful when you
+                    are building a new layer for an existing DAG that you will want to
+                    :meth:`compose` onto the base, since :meth:`compose` can inline captures onto
+                    the base circuit (but not other variables).
+
+                drop
+                    The output DAG will have no variables defined.
 
         Returns:
             DAGCircuit: An empty copy of self.
@@ -639,6 +700,21 @@ def copy_empty_like(self):
         for creg in self.cregs.values():
             target_dag.add_creg(creg)
 
+        if vars_mode == "alike":
+            for var in self.iter_input_vars():
+                target_dag.add_input_var(var)
+            for var in self.iter_captured_vars():
+                target_dag.add_captured_var(var)
+            for var in self.iter_declared_vars():
+                target_dag.add_declared_var(var)
+        elif vars_mode == "captures":
+            for var in self.iter_vars():
+                target_dag.add_captured_var(var)
+        elif vars_mode == "drop":
+            pass
+        else:  # pragma: no cover
+            raise ValueError(f"unknown vars_mode: '{vars_mode}'")
+
         return target_dag
 
     def apply_operation_back(
@@ -669,17 +745,17 @@ def apply_operation_back(
         """
         qargs = tuple(qargs)
         cargs = tuple(cargs)
+        additional = ()
 
-        if self._operation_may_have_bits(op):
+        if _may_have_additional_wires(op):
             # This is the slow path; most of the time, this won't happen.
-            all_cbits = set(self._bits_in_operation(op)).union(cargs)
-        else:
-            all_cbits = cargs
+            additional = set(_additional_wires(op)).difference(cargs)
 
         if check:
             self._check_condition(op.name, getattr(op, "condition", None))
-            self._check_bits(qargs, self.output_map)
-            self._check_bits(all_cbits, self.output_map)
+            self._check_wires(qargs, self.output_map)
+            self._check_wires(cargs, self.output_map)
+            self._check_wires(additional, self.output_map)
 
         node = DAGOpNode(op=op, qargs=qargs, cargs=cargs, dag=self)
         node._node_id = self._multi_graph.add_node(node)
@@ -690,7 +766,7 @@ def apply_operation_back(
         # and adding new edges from the operation node to each output node
         self._multi_graph.insert_node_on_in_edges_multiple(
             node._node_id,
-            [self.output_map[bit]._node_id for bits in (qargs, all_cbits) for bit in bits],
+            [self.output_map[bit]._node_id for bits in (qargs, cargs, additional) for bit in bits],
         )
         return node
 
@@ -721,17 +797,17 @@ def apply_operation_front(
         """
         qargs = tuple(qargs)
         cargs = tuple(cargs)
+        additional = ()
 
-        if self._operation_may_have_bits(op):
+        if _may_have_additional_wires(op):
             # This is the slow path; most of the time, this won't happen.
-            all_cbits = set(self._bits_in_operation(op)).union(cargs)
-        else:
-            all_cbits = cargs
+            additional = set(_additional_wires(op)).difference(cargs)
 
         if check:
             self._check_condition(op.name, getattr(op, "condition", None))
-            self._check_bits(qargs, self.input_map)
-            self._check_bits(all_cbits, self.input_map)
+            self._check_wires(qargs, self.output_map)
+            self._check_wires(cargs, self.output_map)
+            self._check_wires(additional, self.output_map)
 
         node = DAGOpNode(op=op, qargs=qargs, cargs=cargs, dag=self)
         node._node_id = self._multi_graph.add_node(node)
@@ -742,11 +818,13 @@ def apply_operation_front(
         # and adding new edges to the operation node from each input node
         self._multi_graph.insert_node_on_out_edges_multiple(
             node._node_id,
-            [self.input_map[bit]._node_id for bits in (qargs, all_cbits) for bit in bits],
+            [self.input_map[bit]._node_id for bits in (qargs, cargs, additional) for bit in bits],
         )
         return node
 
-    def compose(self, other, qubits=None, clbits=None, front=False, inplace=True):
+    def compose(
+        self, other, qubits=None, clbits=None, front=False, inplace=True, *, inline_captures=False
+    ):
         """Compose the ``other`` circuit onto the output of this circuit.
 
         A subset of input wires of ``other`` are mapped
@@ -760,6 +838,18 @@ def compose(self, other, qubits=None, clbits=None, front=False, inplace=True):
             clbits (list[Clbit|int]): clbits of self to compose onto.
             front (bool): If True, front composition will be performed (not implemented yet)
             inplace (bool): If True, modify the object. Otherwise return composed circuit.
+            inline_captures (bool): If ``True``, variables marked as "captures" in the ``other`` DAG
+                will be inlined onto existing uses of those same variables in ``self``.  If
+                ``False``, all variables in ``other`` are required to be distinct from ``self``, and
+                they will be added to ``self``.
+
+        ..
+            Note: unlike `QuantumCircuit.compose`, there's no `var_remap` argument here.  That's
+            because the `DAGCircuit` inner-block structure isn't set up well to allow the recursion,
+            and `DAGCircuit.compose` is generally only used to rebuild a DAG from layers within
+            itself, not to join unrelated circuits.  While there's no strong motivating use-case
+            (unlike the `QuantumCircuit` equivalent), it's safer and more performant to not provide
+            the option.
 
         Returns:
             DAGCircuit: the composed dag (returns None if inplace==True).
@@ -822,27 +912,52 @@ def compose(self, other, qubits=None, clbits=None, front=False, inplace=True):
         for gate, cals in other.calibrations.items():
             dag._calibrations[gate].update(cals)
 
+        # This is all the handling we need for realtime variables, if there's no remapping. They:
+        #
+        # * get added to the DAG and then operations involving them get appended on normally.
+        # * get inlined onto an existing variable, then operations get appended normally.
+        # * there's a clash or a failed inlining, and we just raise an error.
+        #
+        # Notably if there's no remapping, there's no need to recurse into control-flow or to do any
+        # Var rewriting during the Expr visits.
+        for var in other.iter_input_vars():
+            dag.add_input_var(var)
+        if inline_captures:
+            for var in other.iter_captured_vars():
+                if not dag.has_var(var):
+                    raise DAGCircuitError(
+                        f"Variable '{var}' to be inlined is not in the base DAG."
+                        " If you wanted it to be automatically added, use `inline_captures=False`."
+                    )
+        else:
+            for var in other.iter_captured_vars():
+                dag.add_captured_var(var)
+        for var in other.iter_declared_vars():
+            dag.add_declared_var(var)
+
         # Ensure that the error raised here is a `DAGCircuitError` for backwards compatibility.
         def _reject_new_register(reg):
             raise DAGCircuitError(f"No register with '{reg.bits}' to map this expression onto.")
 
         variable_mapper = _classical_resource_map.VariableMapper(
-            dag.cregs.values(), edge_map, _reject_new_register
+            dag.cregs.values(), edge_map, add_register=_reject_new_register
         )
         for nd in other.topological_nodes():
             if isinstance(nd, DAGInNode):
-                # if in edge_map, get new name, else use existing name
-                m_wire = edge_map.get(nd.wire, nd.wire)
-                # the mapped wire should already exist
-                if m_wire not in dag.output_map:
-                    raise DAGCircuitError(
-                        "wire %s[%d] not in self" % (m_wire.register.name, m_wire.index)
-                    )
-                if nd.wire not in other._wires:
-                    raise DAGCircuitError(
-                        "inconsistent wire type for %s[%d] in other"
-                        % (nd.register.name, nd.wire.index)
-                    )
+                if isinstance(nd.wire, Bit):
+                    # if in edge_map, get new name, else use existing name
+                    m_wire = edge_map.get(nd.wire, nd.wire)
+                    # the mapped wire should already exist
+                    if m_wire not in dag.output_map:
+                        raise DAGCircuitError(
+                            "wire %s[%d] not in self" % (m_wire.register.name, m_wire.index)
+                        )
+                    if nd.wire not in other._wires:
+                        raise DAGCircuitError(
+                            "inconsistent wire type for %s[%d] in other"
+                            % (nd.register.name, nd.wire.index)
+                        )
+                # If it's a Var wire, we already checked that it exists in the destination.
             elif isinstance(nd, DAGOutNode):
                 # ignore output nodes
                 pass
@@ -1030,6 +1145,52 @@ def num_tensor_factors(self):
         """Compute how many components the circuit can decompose into."""
         return rx.number_weakly_connected_components(self._multi_graph)
 
+    @property
+    def num_vars(self):
+        """Total number of classical variables tracked by the circuit."""
+        return len(self._vars_info)
+
+    @property
+    def num_input_vars(self):
+        """Number of input classical variables tracked by the circuit."""
+        return len(self._vars_by_type[_DAGVarType.INPUT])
+
+    @property
+    def num_captured_vars(self):
+        """Number of captured classical variables tracked by the circuit."""
+        return len(self._vars_by_type[_DAGVarType.CAPTURE])
+
+    @property
+    def num_declared_vars(self):
+        """Number of declared local classical variables tracked by the circuit."""
+        return len(self._vars_by_type[_DAGVarType.DECLARE])
+
+    def iter_vars(self):
+        """Iterable over all the classical variables tracked by the circuit."""
+        return itertools.chain.from_iterable(self._vars_by_type.values())
+
+    def iter_input_vars(self):
+        """Iterable over the input classical variables tracked by the circuit."""
+        return iter(self._vars_by_type[_DAGVarType.INPUT])
+
+    def iter_captured_vars(self):
+        """Iterable over the captured classical variables tracked by the circuit."""
+        return iter(self._vars_by_type[_DAGVarType.CAPTURE])
+
+    def iter_declared_vars(self):
+        """Iterable over the declared local classical variables tracked by the circuit."""
+        return iter(self._vars_by_type[_DAGVarType.DECLARE])
+
+    def has_var(self, var: str | expr.Var) -> bool:
+        """Is this realtime variable in the DAG?
+
+        Args:
+            var: the variable or name to check.  A name matches any variable stored under it; a
+                ``Var`` matches only if the stored variable is that same object (``is``)."""
+        if isinstance(var, str):
+            return var in self._vars_info
+        return (info := self._vars_info.get(var.name, False)) and info.var is var
+
     def __eq__(self, other):
         # Try to convert to float, but in case of unbound ParameterExpressions
         # a TypeError will be raise, fallback to normal equality in those
@@ -1047,6 +1208,11 @@ def __eq__(self, other):
         if self.calibrations != other.calibrations:
             return False
 
+        # We don't do any semantic equivalence between Var nodes, as things stand; DAGs can only be
+        # equal in our mind if they use the exact same UUID vars.
+        if self._vars_by_type != other._vars_by_type:
+            return False
+
         self_bit_indices = {bit: idx for idx, bit in enumerate(self.qubits + self.clbits)}
         other_bit_indices = {bit: idx for idx, bit in enumerate(other.qubits + other.clbits)}
 
@@ -1130,7 +1296,8 @@ def replace_block_with_op(
                 multiple gates in the combined single op node.  If a :class:`.Bit` is not in the
                 dictionary, it will not be added to the args; this can be useful when dealing with
                 control-flow operations that have inherent bits in their ``condition`` or ``target``
-                fields.
+                fields.  :class:`.expr.Var` wires similarly do not need to be in this map, since
+                they will never be in ``qargs`` or ``cargs``.
             cycle_check (bool): When set to True this method will check that
                 replacing the provided ``node_block`` with a single node
                 would introduce a cycle (which would invalidate the
@@ -1197,12 +1364,22 @@ def substitute_node_with_dag(self, node, input_dag, wires=None, propagate_condit
 
         Args:
             node (DAGOpNode): node to substitute
-            input_dag (DAGCircuit): circuit that will substitute the node
+            input_dag (DAGCircuit): circuit that will substitute the node.
             wires (list[Bit] | Dict[Bit, Bit]): gives an order for (qu)bits
                 in the input circuit. If a list, then the bits refer to those in the ``input_dag``,
                 and the order gets matched to the node wires by qargs first, then cargs, then
                 conditions.  If a dictionary, then a mapping of bits in the ``input_dag`` to those
                 that the ``node`` acts on.
+
+                Standalone :class:`~.expr.Var` nodes cannot currently be remapped as part of the
+                substitution; the ``input_dag`` should be defined over the correct set of variables
+                already.
+
+                ..
+                    The rule about not remapping `Var`s is to avoid performance pitfalls and reduce
+                    complexity; the creator of the input DAG should easily be able to arrange for
+                    the correct `Var`s to be used, and doing so avoids us needing to recurse through
+                    control-flow operations to do deep remappings.
             propagate_condition (bool): If ``True`` (default), then any ``condition`` attribute on
                 the operation within ``node`` is propagated to each node in the ``input_dag``.  If
                 ``False``, then the ``input_dag`` is assumed to faithfully implement suitable
@@ -1227,9 +1404,9 @@ def substitute_node_with_dag(self, node, input_dag, wires=None, propagate_condit
             node_wire_order = list(node.qargs) + list(node.cargs)
             # If we're not propagating it, the number of wires in the input DAG should include the
             # condition as well.
-            if not propagate_condition and self._operation_may_have_bits(node.op):
+            if not propagate_condition and _may_have_additional_wires(node.op):
                 node_wire_order += [
-                    bit for bit in self._bits_in_operation(node.op) if bit not in node_cargs
+                    wire for wire in _additional_wires(node.op) if wire not in node_cargs
                 ]
             if len(wires) != len(node_wire_order):
                 raise DAGCircuitError(
@@ -1241,12 +1418,27 @@ def substitute_node_with_dag(self, node, input_dag, wires=None, propagate_condit
         for input_dag_wire, our_wire in wire_map.items():
             if our_wire not in self.input_map:
                 raise DAGCircuitError(f"bit mapping invalid: {our_wire} is not in this DAG")
+            if isinstance(our_wire, expr.Var) or isinstance(input_dag_wire, expr.Var):
+                raise DAGCircuitError("`Var` nodes cannot be remapped during substitution")
             # Support mapping indiscriminately between Qubit and AncillaQubit, etc.
             check_type = Qubit if isinstance(our_wire, Qubit) else Clbit
             if not isinstance(input_dag_wire, check_type):
                 raise DAGCircuitError(
                     f"bit mapping invalid: {input_dag_wire} and {our_wire} are different bit types"
                 )
+        if _may_have_additional_wires(node.op):
+            node_vars = {var for var in _additional_wires(node.op) if isinstance(var, expr.Var)}
+        else:
+            node_vars = set()
+        dag_vars = set(input_dag.iter_vars())
+        if dag_vars - node_vars:
+            raise DAGCircuitError(
+                "Cannot replace a node with a DAG with more variables."
+                f" Variables in node: {node_vars}."
+                f" Variables in DAG: {dag_vars}."
+            )
+        for var in dag_vars:
+            wire_map[var] = var
 
         reverse_wire_map = {b: a for a, b in wire_map.items()}
         # It doesn't make sense to try and propagate a condition from a control-flow op; a
@@ -1325,14 +1517,22 @@ def substitute_node_with_dag(self, node, input_dag, wires=None, propagate_condit
                     node._node_id, lambda edge, wire=self_wire: edge == wire
                 )[0]
                 self._multi_graph.add_edge(pred._node_id, succ._node_id, self_wire)
+        for contracted_var in node_vars - dag_vars:
+            pred = self._multi_graph.find_predecessors_by_edge(
+                node._node_id, lambda edge, wire=contracted_var: edge == wire
+            )[0]
+            succ = self._multi_graph.find_successors_by_edge(
+                node._node_id, lambda edge, wire=contracted_var: edge == wire
+            )[0]
+            self._multi_graph.add_edge(pred._node_id, succ._node_id, contracted_var)
 
         # Exlude any nodes from in_dag that are not a DAGOpNode or are on
-        # bits outside the set specified by the wires kwarg
+        # wires outside the set specified by the wires kwarg
         def filter_fn(node):
             if not isinstance(node, DAGOpNode):
                 return False
-            for qarg in node.qargs:
-                if qarg not in wire_map:
+            for _, _, wire in in_dag.edges(node):
+                if wire not in wire_map:
                     return False
             return True
 
@@ -1369,7 +1569,7 @@ def edge_weight_map(wire):
         self._decrement_op(node.op)
 
         variable_mapper = _classical_resource_map.VariableMapper(
-            self.cregs.values(), wire_map, self.add_creg
+            self.cregs.values(), wire_map, add_register=self.add_creg
         )
         # Iterate over nodes of input_circuit and update wires in node objects migrated
         # from in_dag
@@ -1441,21 +1641,12 @@ def substitute_node(self, node: DAGOpNode, op, inplace: bool = False, propagate_
         # This might include wires that are inherent to the node, like in its `condition` or
         # `target` fields, so might be wider than `node.op.num_{qu,cl}bits`.
         current_wires = {wire for _, _, wire in self.edges(node)}
-        new_wires = set(node.qargs) | set(node.cargs)
-        if (new_condition := getattr(op, "condition", None)) is not None:
-            new_wires.update(condition_resources(new_condition).clbits)
-        elif isinstance(op, SwitchCaseOp):
-            if isinstance(op.target, Clbit):
-                new_wires.add(op.target)
-            elif isinstance(op.target, ClassicalRegister):
-                new_wires.update(op.target)
-            else:
-                new_wires.update(node_resources(op.target).clbits)
+        new_wires = set(node.qargs) | set(node.cargs) | set(_additional_wires(op))
 
         if propagate_condition and not (
             isinstance(node.op, ControlFlowOp) or isinstance(op, ControlFlowOp)
         ):
-            if new_condition is not None:
+            if getattr(op, "condition", None) is not None:
                 raise DAGCircuitError(
                     "Cannot propagate a condition to an operation that already has one."
                 )
@@ -1491,13 +1682,17 @@ def substitute_node(self, node: DAGOpNode, op, inplace: bool = False, propagate_
             self._decrement_op(node.op)
         return new_node
 
-    def separable_circuits(self, remove_idle_qubits: bool = False) -> list["DAGCircuit"]:
+    def separable_circuits(
+        self, remove_idle_qubits: bool = False, *, vars_mode: _VarsMode = "alike"
+    ) -> list["DAGCircuit"]:
         """Decompose the circuit into sets of qubits with no gates connecting them.
 
         Args:
             remove_idle_qubits (bool): Flag denoting whether to remove idle qubits from
                 the separated circuits. If ``False``, each output circuit will contain the
                 same number of qubits as ``self``.
+            vars_mode: how any realtime :class:`~.expr.Var` nodes should be handled in the output
+                DAGs.  See :meth:`copy_empty_like` for details on the modes.
 
         Returns:
             List[DAGCircuit]: The circuits resulting from separating ``self`` into sets
@@ -1522,7 +1717,7 @@ def _key(x):
         # Create new DAGCircuit objects from each of the rustworkx subgraph objects
         decomposed_dags = []
         for subgraph in disconnected_subgraphs:
-            new_dag = self.copy_empty_like()
+            new_dag = self.copy_empty_like(vars_mode=vars_mode)
             new_dag.global_phase = 0
             subgraph_is_classical = True
             for node in rx.lexicographical_topological_sort(subgraph, key=_key):
@@ -1706,7 +1901,7 @@ def classical_predecessors(self, node):
         connected by a classical edge as DAGOpNodes and DAGInNodes."""
         return iter(
             self._multi_graph.find_predecessors_by_edge(
-                node._node_id, lambda edge_data: isinstance(edge_data, Clbit)
+                node._node_id, lambda edge_data: not isinstance(edge_data, Qubit)
             )
         )
 
@@ -1739,7 +1934,7 @@ def classical_successors(self, node):
         connected by a classical edge as DAGOpNodes and DAGInNodes."""
         return iter(
             self._multi_graph.find_successors_by_edge(
-                node._node_id, lambda edge_data: isinstance(edge_data, Clbit)
+                node._node_id, lambda edge_data: not isinstance(edge_data, Qubit)
             )
         )
 
@@ -1804,7 +1999,7 @@ def front_layer(self):
 
         return op_nodes
 
-    def layers(self):
+    def layers(self, *, vars_mode: _VarsMode = "captures"):
         """Yield a shallow view on a layer of this DAGCircuit for all d layers of this circuit.
 
         A layer is a circuit whose gates act on disjoint qubits, i.e.,
@@ -1821,6 +2016,10 @@ def layers(self):
         TODO: Gates that use the same cbits will end up in different
         layers as this is currently implemented. This may not be
         the desired behavior.
+
+        Args:
+            vars_mode: how any realtime :class:`~.expr.Var` nodes should be handled in the output
+                DAGs.  See :meth:`copy_empty_like` for details on the modes.
         """
         graph_layers = self.multigraph_layers()
         try:
@@ -1845,7 +2044,7 @@ def layers(self):
                 return
 
             # Construct a shallow copy of self
-            new_layer = self.copy_empty_like()
+            new_layer = self.copy_empty_like(vars_mode=vars_mode)
 
             for node in op_nodes:
                 # this creates new DAGOpNodes in the new_layer
@@ -1860,14 +2059,18 @@ def layers(self):
 
             yield {"graph": new_layer, "partition": support_list}
 
-    def serial_layers(self):
+    def serial_layers(self, *, vars_mode: _VarsMode = "captures"):
         """Yield a layer for all gates of this circuit.
 
         A serial layer is a circuit with one gate. The layers have the
         same structure as in layers().
+
+        Args:
+            vars_mode: how any realtime :class:`~.expr.Var` nodes should be handled in the output
+                DAGs.  See :meth:`copy_empty_like` for details on the modes.
         """
         for next_node in self.topological_op_nodes():
-            new_layer = self.copy_empty_like()
+            new_layer = self.copy_empty_like(vars_mode=vars_mode)
 
             # Save the support of the operation we add to the layer
             support_list = []
@@ -2123,3 +2326,82 @@ def draw(self, scale=0.7, filename=None, style="color"):
         from qiskit.visualization.dag_visualization import dag_drawer
 
         return dag_drawer(dag=self, scale=scale, filename=filename, style=style)
+
+
+class _DAGVarType(enum.Enum):
+    INPUT = enum.auto()
+    CAPTURE = enum.auto()
+    DECLARE = enum.auto()
+
+
+class _DAGVarInfo:
+    __slots__ = ("var", "type", "in_node", "out_node")
+
+    def __init__(self, var: expr.Var, type_: _DAGVarType, in_node: DAGInNode, out_node: DAGOutNode):
+        self.var = var
+        self.type = type_
+        self.in_node = in_node
+        self.out_node = out_node
+
+
+def _may_have_additional_wires(operation) -> bool:
+    """Return whether a given :class:`.Operation` may contain references to additional wires
+    locations within itself.  If this is ``True``, it doesn't necessarily mean that the operation
+    _will_ access memory inherently, but a ``False`` return guarantees that it won't.
+
+    The memory might be classical bits or classical variables, such as a control-flow operation or a
+    store.
+
+    Args:
+        operation (qiskit.circuit.Operation): the operation to check.
+    """
+    # This is separate to `_additional_wires` because most of the time there won't be any extra
+    # wires beyond the explicit `qargs` and `cargs` so we want a fast path to be able to skip
+    # creating and testing a generator for emptiness.
+    #
+    # If updating this, you most likely also need to update `_additional_wires`.
+    return getattr(operation, "condition", None) is not None or isinstance(
+        operation, (ControlFlowOp, Store)
+    )
+
+
+def _additional_wires(operation) -> Iterable[Clbit | expr.Var]:
+    """Return an iterable over the additional tracked memory usage in this operation.  These
+    additional wires include (for example, non-exhaustive) bits referred to by a ``condition`` or
+    the classical variables involved in control-flow operations.
+
+    Args:
+        operation: the :class:`~.circuit.Operation` instance for a node.
+
+    Returns:
+        Iterable: the additional wires inherent to this operation.
+    """
+    # If updating this, you likely need to update `_may_have_additional_wires` too.
+    if (condition := getattr(operation, "condition", None)) is not None:
+        if isinstance(condition, expr.Expr):
+            yield from _wires_from_expr(condition)
+        else:
+            yield from condition_resources(condition).clbits
+    if isinstance(operation, ControlFlowOp):
+        yield from operation.iter_captured_vars()
+        if isinstance(operation, SwitchCaseOp):
+            target = operation.target
+            if isinstance(target, Clbit):
+                yield target
+            elif isinstance(target, ClassicalRegister):
+                yield from target
+            else:
+                yield from _wires_from_expr(target)
+    elif isinstance(operation, Store):
+        yield from _wires_from_expr(operation.lvalue)
+        yield from _wires_from_expr(operation.rvalue)
+
+
+def _wires_from_expr(node: expr.Expr) -> Iterable[Clbit | expr.Var]:
+    for var in expr.iter_vars(node):
+        if isinstance(var.var, Clbit):
+            yield var.var
+        elif isinstance(var.var, ClassicalRegister):
+            yield from var.var
+        else:
+            yield var
diff --git a/qiskit/primitives/__init__.py b/qiskit/primitives/__init__.py
index 0a36dbbb0bd3..2423f3545f80 100644
--- a/qiskit/primitives/__init__.py
+++ b/qiskit/primitives/__init__.py
@@ -86,17 +86,17 @@
     estimator = Estimator()
 
     # calculate [ <psi1(theta1)|H1|psi1(theta1)> ]
-    job = estimator.run([(psi1, hamiltonian1, [theta1])])
+    job = estimator.run([(psi1, H1, [theta1])])
     job_result = job.result() # It will block until the job finishes.
-    print(f"The primitive-job finished with result {job_result}"))
+    print(f"The primitive-job finished with result {job_result}")
 
     # calculate [ [<psi1(theta1)|H1|psi1(theta1)>,
     #              <psi1(theta3)|H3|psi1(theta3)>],
     #             [<psi2(theta2)|H2|psi2(theta2)>] ]
     job2 = estimator.run(
         [
-            (psi1, [hamiltonian1, hamiltonian3], [theta1, theta3]), 
-            (psi2, hamiltonian2, theta2)
+            (psi1, [H1, H3], [theta1, theta3]), 
+            (psi2, H2, theta2)
         ],
         precision=0.01
     )
diff --git a/qiskit/primitives/backend_estimator_v2.py b/qiskit/primitives/backend_estimator_v2.py
index 525b223b9505..9afc6d892f3d 100644
--- a/qiskit/primitives/backend_estimator_v2.py
+++ b/qiskit/primitives/backend_estimator_v2.py
@@ -14,10 +14,10 @@
 
 from __future__ import annotations
 
+import math
 from collections import defaultdict
 from collections.abc import Iterable
 from dataclasses import dataclass
-import math
 
 import numpy as np
 
@@ -25,6 +25,7 @@
 from qiskit.exceptions import QiskitError
 from qiskit.providers import BackendV1, BackendV2
 from qiskit.quantum_info import Pauli, PauliList
+from qiskit.result import Counts
 from qiskit.transpiler import PassManager, PassManagerConfig
 from qiskit.transpiler.passes import Optimize1qGatesDecomposition
 
@@ -56,6 +57,20 @@ class Options:
     """
 
 
+@dataclass
+class _PreprocessedData:
+    """Internal data structure to store the results of the preprocessing of a pub."""
+
+    circuits: list[QuantumCircuit]
+    """The quantum circuits generated by binding parameters of the pub's circuit."""
+
+    parameter_indices: np.ndarray
+    """The indices of the pub's bindings array broadcast to the shape of the pub."""
+
+    observables: np.ndarray
+    """The pub's observable array broadcast to the shape of the pub."""
+
+
 class BackendEstimatorV2(BaseEstimatorV2):
     """Evaluates expectation values for provided quantum circuit and observable combinations
 
@@ -144,10 +159,58 @@ def _validate_pubs(self, pubs: list[EstimatorPub]):
                 )
 
     def _run(self, pubs: list[EstimatorPub]) -> PrimitiveResult[PubResult]:
-        return PrimitiveResult([self._run_pub(pub) for pub in pubs])
+        pub_dict = defaultdict(list)
+        # consolidate pubs with the same number of shots
+        for i, pub in enumerate(pubs):
+            shots = int(math.ceil(1.0 / pub.precision**2))
+            pub_dict[shots].append(i)
+
+        results = [None] * len(pubs)
+        for shots, lst in pub_dict.items():
+            # run pubs with the same number of shots at once
+            pub_results = self._run_pubs([pubs[i] for i in lst], shots)
+            # reconstruct the result of pubs
+            for i, pub_result in zip(lst, pub_results):
+                results[i] = pub_result
+        return PrimitiveResult(results)
+
+    def _run_pubs(self, pubs: list[EstimatorPub], shots: int) -> list[PubResult]:
+        """Compute results for pubs that all require the same value of ``shots``."""
+        preprocessed_data = []
+        flat_circuits = []
+        for pub in pubs:
+            data = self._preprocess_pub(pub)
+            preprocessed_data.append(data)
+            flat_circuits.extend(data.circuits)
+
+        run_result, metadata = _run_circuits(
+            flat_circuits, self._backend, shots=shots, seed_simulator=self._options.seed_simulator
+        )
+        counts = _prepare_counts(run_result)
 
-    def _run_pub(self, pub: EstimatorPub) -> PubResult:
-        shots = math.ceil(1.0 / pub.precision**2)
+        results = []
+        start = 0
+        for pub, data in zip(pubs, preprocessed_data):
+            end = start + len(data.circuits)
+            expval_map = self._calc_expval_map(counts[start:end], metadata[start:end])
+            start = end
+            results.append(self._postprocess_pub(pub, expval_map, data, shots))
+        return results
+
+    def _preprocess_pub(self, pub: EstimatorPub) -> _PreprocessedData:
+        """Converts a pub into a list of bound circuits necessary to estimate all its observables.
+
+        The circuits contain metadata explaining which bindings array index they are with respect to,
+        and which measurement basis they are measuring.
+
+        Args:
+            pub: The pub to preprocess.
+
+        Returns:
+            A :class:`_PreprocessedData` whose ``circuits`` are the circuits to execute on the
+            backend, ``parameter_indices`` are indices of the pub's bindings array and
+            ``observables`` is the observables array, both broadcast to the shape of the pub.
+        """
         circuit = pub.circuit
         observables = pub.observables
         parameter_values = pub.parameter_values
@@ -161,10 +224,29 @@ def _run_pub(self, pub: EstimatorPub) -> PubResult:
         param_obs_map = defaultdict(set)
         for index in np.ndindex(*bc_param_ind.shape):
             param_index = bc_param_ind[index]
-            param_obs_map[param_index].update(bc_obs[index].keys())
-        expval_map = self._calc_expval_paulis(circuit, parameter_values, param_obs_map, shots)
+            param_obs_map[param_index].update(bc_obs[index])
+
+        bound_circuits = self._bind_and_add_measurements(circuit, parameter_values, param_obs_map)
+        return _PreprocessedData(bound_circuits, bc_param_ind, bc_obs)
+
+    def _postprocess_pub(
+        self, pub: EstimatorPub, expval_map: dict, data: _PreprocessedData, shots: int
+    ) -> PubResult:
+        """Computes expectation values (evs) and standard errors (stds).
+
+        The values are stored in arrays broadcast to the shape of the pub.
 
-        # calculate expectation values (evs) and standard errors (stds)
+        Args:
+            pub: The pub to postprocess.
+            expval_map: The map of expectation values computed by :meth:`_calc_expval_map`.
+            data: The result data of the preprocessing.
+            shots: The number of shots.
+
+        Returns:
+            The pub result.
+        """
+        bc_param_ind = data.parameter_indices
+        bc_obs = data.observables
         evs = np.zeros_like(bc_param_ind, dtype=float)
         variances = np.zeros_like(bc_param_ind, dtype=float)
         for index in np.ndindex(*bc_param_ind.shape):
@@ -178,30 +260,55 @@ def _run_pub(self, pub: EstimatorPub) -> PubResult:
         data_bin = data_bin_cls(evs=evs, stds=stds)
         return PubResult(data_bin, metadata={"target_precision": pub.precision})
 
-    def _calc_expval_paulis(
+    def _bind_and_add_measurements(
         self,
         circuit: QuantumCircuit,
         parameter_values: BindingsArray,
         param_obs_map: dict[tuple[int, ...], set[str]],
-        shots: int,
-    ) -> dict[tuple[tuple[int, ...], str], tuple[float, float]]:
-        # generate circuits
+    ) -> list[QuantumCircuit]:
+        """Bind the given circuit against each parameter value set, and add necessary measurements
+        to each.
+
+        Args:
+            circuit: The (possibly parametric) circuit of interest.
+            parameter_values: An array of parameter value sets that can be applied to the circuit.
+            param_obs_map: A mapping from locations in ``parameter_values`` to sets of
+                Pauli terms whose expectation values are required in those locations.
+
+        Returns:
+            A flat list of circuits sufficient to measure all Pauli terms in the ``param_obs_map``
+            values at the corresponding ``parameter_values`` location, where requisite
+            book-keeping is stored as circuit metadata.
+        """
         circuits = []
         for param_index, pauli_strings in param_obs_map.items():
             bound_circuit = parameter_values.bind(circuit, param_index)
             # sort pauli_strings so that the order is deterministic
             meas_paulis = PauliList(sorted(pauli_strings))
-            new_circuits = self._preprocessing(bound_circuit, meas_paulis, param_index)
+            new_circuits = self._create_measurement_circuits(
+                bound_circuit, meas_paulis, param_index
+            )
             circuits.extend(new_circuits)
+        return circuits
 
-        # run circuits
-        result, metadata = _run_circuits(
-            circuits, self._backend, shots=shots, seed_simulator=self._options.seed_simulator
-        )
+    def _calc_expval_map(
+        self,
+        counts: list[Counts],
+        metadata: dict,
+    ) -> dict[tuple[tuple[int, ...], str], tuple[float, float]]:
+        """Computes the map of expectation values.
 
-        # postprocessing results
+        Args:
+            counts: The counts data.
+            metadata: The metadata entries, one per element of ``counts`` (in the same order).
+
+        Returns:
+            The map of expectation values takes a pair of an index of the bindings array and
+            a pauli string as a key and returns the expectation value of the pauli string
+            with the pub's circuit bound against the parameter value set in the index of
+            the bindings array.
+        """
         expval_map: dict[tuple[tuple[int, ...], str], tuple[float, float]] = {}
-        counts = _prepare_counts(result)
         for count, meta in zip(counts, metadata):
             orig_paulis = meta["orig_paulis"]
             meas_paulis = meta["meas_paulis"]
@@ -211,10 +318,23 @@ def _calc_expval_paulis(
                 expval_map[param_index, pauli.to_label()] = (expval, variance)
         return expval_map
 
-    def _preprocessing(
+    def _create_measurement_circuits(
         self, circuit: QuantumCircuit, observable: PauliList, param_index: tuple[int, ...]
     ) -> list[QuantumCircuit]:
-        # generate measurement circuits with metadata
+        """Generate a list of circuits sufficient to estimate each of the given Paulis.
+
+        Paulis are divided into qubitwise-commuting subsets to reduce the total circuit count.
+        Metadata is attached to circuits in order to remember what each one measures, and
+        where it belongs in the output.
+
+        Args:
+            circuit: The circuit of interest.
+            observable: Which Pauli terms we would like to observe.
+            param_index: Where to put the data we estimate (only passed to metadata).
+
+        Returns:
+            A list of circuits sufficient to estimate each of the given Paulis.
+        """
         meas_circuits: list[QuantumCircuit] = []
         if self._options.abelian_grouping:
             for obs in observable.group_commuting(qubit_wise=True):
diff --git a/qiskit/primitives/backend_sampler_v2.py b/qiskit/primitives/backend_sampler_v2.py
index 23861aa34f45..87507e1d54d0 100644
--- a/qiskit/primitives/backend_sampler_v2.py
+++ b/qiskit/primitives/backend_sampler_v2.py
@@ -15,6 +15,7 @@
 from __future__ import annotations
 
 import warnings
+from collections import defaultdict
 from dataclasses import dataclass
 from typing import Iterable
 
@@ -141,37 +142,77 @@ def _validate_pubs(self, pubs: list[SamplerPub]):
                     UserWarning,
                 )
 
-    def _run(self, pubs: Iterable[SamplerPub]) -> PrimitiveResult[PubResult]:
-        results = [self._run_pub(pub) for pub in pubs]
+    def _run(self, pubs: list[SamplerPub]) -> PrimitiveResult[PubResult]:
+        pub_dict = defaultdict(list)
+        # consolidate pubs with the same number of shots
+        for i, pub in enumerate(pubs):
+            pub_dict[pub.shots].append(i)
+
+        results = [None] * len(pubs)
+        for shots, lst in pub_dict.items():
+            # run pubs with the same number of shots at once
+            pub_results = self._run_pubs([pubs[i] for i in lst], shots)
+            # reconstruct the result of pubs
+            for i, pub_result in zip(lst, pub_results):
+                results[i] = pub_result
         return PrimitiveResult(results)
 
-    def _run_pub(self, pub: SamplerPub) -> PubResult:
-        meas_info, max_num_bytes = _analyze_circuit(pub.circuit)
-        bound_circuits = pub.parameter_values.bind_all(pub.circuit)
-        arrays = {
-            item.creg_name: np.zeros(
-                bound_circuits.shape + (pub.shots, item.num_bytes), dtype=np.uint8
-            )
-            for item in meas_info
-        }
-        flatten_circuits = np.ravel(bound_circuits).tolist()
-        result_memory, _ = _run_circuits(
+    def _run_pubs(self, pubs: list[SamplerPub], shots: int) -> list[PubResult]:
+        """Compute results for pubs that all require the same value of ``shots``."""
+        # prepare circuits
+        bound_circuits = [pub.parameter_values.bind_all(pub.circuit) for pub in pubs]
+        flatten_circuits = []
+        for circuits in bound_circuits:
+            flatten_circuits.extend(np.ravel(circuits).tolist())
+
+        # run circuits
+        results, _ = _run_circuits(
             flatten_circuits,
             self._backend,
             memory=True,
-            shots=pub.shots,
+            shots=shots,
             seed_simulator=self._options.seed_simulator,
         )
-        memory_list = _prepare_memory(result_memory, max_num_bytes)
+        result_memory = _prepare_memory(results)
+
+        # pack memory to an ndarray of uint8
+        results = []
+        start = 0
+        for pub, bound in zip(pubs, bound_circuits):
+            meas_info, max_num_bytes = _analyze_circuit(pub.circuit)
+            end = start + bound.size
+            results.append(
+                self._postprocess_pub(
+                    result_memory[start:end], shots, bound.shape, meas_info, max_num_bytes
+                )
+            )
+            start = end
+
+        return results
 
-        for samples, index in zip(memory_list, np.ndindex(*bound_circuits.shape)):
+    def _postprocess_pub(
+        self,
+        result_memory: list[list[str]],
+        shots: int,
+        shape: tuple[int, ...],
+        meas_info: list[_MeasureInfo],
+        max_num_bytes: int,
+    ) -> PubResult:
+        """Converts the memory data into an array of bit arrays with the shape of the pub."""
+        arrays = {
+            item.creg_name: np.zeros(shape + (shots, item.num_bytes), dtype=np.uint8)
+            for item in meas_info
+        }
+        memory_array = _memory_array(result_memory, max_num_bytes)
+
+        for samples, index in zip(memory_array, np.ndindex(*shape)):
             for item in meas_info:
                 ary = _samples_to_packed_array(samples, item.num_bits, item.start)
                 arrays[item.creg_name][index] = ary
 
         data_bin_cls = make_data_bin(
             [(item.creg_name, BitArray) for item in meas_info],
-            shape=bound_circuits.shape,
+            shape=shape,
         )
         meas = {
             item.creg_name: BitArray(arrays[item.creg_name], item.num_bits) for item in meas_info
@@ -181,6 +222,7 @@ def _run_pub(self, pub: SamplerPub) -> PubResult:
 
 
 def _analyze_circuit(circuit: QuantumCircuit) -> tuple[list[_MeasureInfo], int]:
+    """Analyzes the information for each creg in a circuit."""
     meas_info = []
     max_num_bits = 0
     for creg in circuit.cregs:
@@ -202,24 +244,38 @@ def _analyze_circuit(circuit: QuantumCircuit) -> tuple[list[_MeasureInfo], int]:
     return meas_info, _min_num_bytes(max_num_bits)
 
 
-def _prepare_memory(results: list[Result], num_bytes: int) -> NDArray[np.uint8]:
+def _prepare_memory(results: list[Result]) -> list[list[str]]:
+    """Joins split results if exceeding max_experiments"""
     lst = []
     for res in results:
         for exp in res.results:
             if hasattr(exp.data, "memory") and exp.data.memory:
-                data = b"".join(int(i, 16).to_bytes(num_bytes, "big") for i in exp.data.memory)
-                data = np.frombuffer(data, dtype=np.uint8).reshape(-1, num_bytes)
+                lst.append(exp.data.memory)
             else:
                 # no measure in a circuit
-                data = np.zeros((exp.shots, num_bytes), dtype=np.uint8)
-            lst.append(data)
-    ary = np.array(lst, copy=False)
+                lst.append(["0x0"] * exp.shots)
+    return lst
+
+
+def _memory_array(results: list[list[str]], num_bytes: int) -> NDArray[np.uint8]:
+    """Converts the memory data into an array in an unpacked way."""
+    lst = []
+    for memory in results:
+        if num_bytes > 0:
+            data = b"".join(int(i, 16).to_bytes(num_bytes, "big") for i in memory)
+            data = np.frombuffer(data, dtype=np.uint8).reshape(-1, num_bytes)
+        else:
+            # no measure in a circuit
+            data = np.zeros((len(memory), num_bytes), dtype=np.uint8)
+        lst.append(data)
+    ary = np.asarray(lst)
     return np.unpackbits(ary, axis=-1, bitorder="big")
 
 
 def _samples_to_packed_array(
     samples: NDArray[np.uint8], num_bits: int, start: int
 ) -> NDArray[np.uint8]:
+    """Converts an unpacked array of the memory data into a packed array."""
     # samples of `Backend.run(memory=True)` will be the order of
     # clbit_last, ..., clbit_1, clbit_0
     # place samples in the order of clbit_start+num_bits-1, ..., clbit_start+1, clbit_start
diff --git a/qiskit/primitives/containers/observables_array.py b/qiskit/primitives/containers/observables_array.py
index 0d0322dc6a3c..21c415d75899 100644
--- a/qiskit/primitives/containers/observables_array.py
+++ b/qiskit/primitives/containers/observables_array.py
@@ -101,10 +101,10 @@ def tolist(self) -> list:
         """Convert to a nested list"""
         return self._array.tolist()
 
-    def __array__(self, dtype=None):
+    def __array__(self, dtype=None, copy=None):
         """Convert to an Numpy.ndarray"""
         if dtype is None or dtype == object:
-            return self._array
+            return self._array.copy() if copy else self._array
         raise ValueError("Type must be 'None' or 'object'")
 
     @overload
diff --git a/qiskit/primitives/containers/shape.py b/qiskit/primitives/containers/shape.py
index 952916cd67dc..6d893f46c13f 100644
--- a/qiskit/primitives/containers/shape.py
+++ b/qiskit/primitives/containers/shape.py
@@ -85,7 +85,7 @@ def array_coerce(arr: ArrayLike | Shaped) -> NDArray | Shaped:
     """
     if isinstance(arr, Shaped):
         return arr
-    return np.array(arr, copy=False)
+    return np.asarray(arr)
 
 
 def _flatten_to_ints(arg: ShapeInput) -> Iterable[int]:
diff --git a/qiskit/providers/backend_compat.py b/qiskit/providers/backend_compat.py
index de57f3f09fa2..e567c330a958 100644
--- a/qiskit/providers/backend_compat.py
+++ b/qiskit/providers/backend_compat.py
@@ -1,6 +1,6 @@
 # This code is part of Qiskit.
 #
-# (C) Copyright IBM 2020.
+# (C) Copyright IBM 2020, 2024.
 #
 # This code is licensed under the Apache License, Version 2.0. You may
 # obtain a copy of this license in the LICENSE.txt file in the root directory
@@ -57,7 +57,7 @@ def convert_to_target(
         A ``Target`` instance.
     """
 
-    # importing pacakges where they are needed, to avoid cyclic-import.
+    # importing packages where they are needed, to avoid cyclic-import.
     # pylint: disable=cyclic-import
     from qiskit.transpiler.target import (
         Target,
@@ -82,7 +82,7 @@ def convert_to_target(
         "switch_case": SwitchCaseOp,
     }
 
-    in_data = {"num_qubits": configuration.n_qubits}
+    in_data = {"num_qubits": configuration.num_qubits}
 
     # Parse global configuration properties
     if hasattr(configuration, "dt"):
@@ -97,7 +97,6 @@ def convert_to_target(
     all_instructions = set.union(
         basis_gates, set(required), supported_instructions.intersection(CONTROL_FLOW_OP_NAMES)
     )
-
     inst_name_map = {}  # type: Dict[str, Instruction]
 
     faulty_ops = set()
@@ -244,10 +243,8 @@ def _get_value(prop_dict, prop_name):
 
         for name in inst_sched_map.instructions:
             for qubits in inst_sched_map.qubits_with_instruction(name):
-
                 if not isinstance(qubits, tuple):
                     qubits = (qubits,)
-
                 if (
                     name not in all_instructions
                     or name not in prop_name_map
@@ -267,17 +264,18 @@ def _get_value(prop_dict, prop_name):
                     continue
 
                 entry = inst_sched_map._get_calibration_entry(name, qubits)
-
                 try:
                     prop_name_map[name][qubits].calibration = entry
                 except AttributeError:
+                    # if instruction properties are "None", add entry
+                    prop_name_map[name].update({qubits: InstructionProperties(None, None, entry)})
                     logger.info(
                         "The PulseDefaults payload received contains an instruction %s on "
-                        "qubits %s which is not present in the configuration or properties payload.",
+                        "qubits %s which is not present in the configuration or properties payload. "
+                        "A new properties entry will be added to include the new calibration data.",
                         name,
                         qubits,
                     )
-
     # Add parsed properties to target
     target = Target(**in_data)
     for inst_name in all_instructions:
@@ -384,7 +382,7 @@ def __init__(
         super().__init__(
             provider=backend.provider,
             name=backend.name(),
-            description=self._config.description,
+            description=getattr(self._config, "description", None),
             online_date=getattr(self._config, "online_date", None),
             backend_version=self._config.backend_version,
         )
diff --git a/qiskit/providers/basic_provider/basic_provider_tools.py b/qiskit/providers/basic_provider/basic_provider_tools.py
index 030c629275ed..b2670cc0977f 100644
--- a/qiskit/providers/basic_provider/basic_provider_tools.py
+++ b/qiskit/providers/basic_provider/basic_provider_tools.py
@@ -23,7 +23,30 @@
 from qiskit.exceptions import QiskitError
 
 # Single qubit gates supported by ``single_gate_params``.
-SINGLE_QUBIT_GATES = ("U", "u", "h", "p", "u1", "u2", "u3", "rz", "sx", "x")
+SINGLE_QUBIT_GATES = {
+    "U": gates.UGate,
+    "u": gates.UGate,
+    "u1": gates.U1Gate,
+    "u2": gates.U2Gate,
+    "u3": gates.U3Gate,
+    "h": gates.HGate,
+    "p": gates.PhaseGate,
+    "s": gates.SGate,
+    "sdg": gates.SdgGate,
+    "sx": gates.SXGate,
+    "sxdg": gates.SXdgGate,
+    "t": gates.TGate,
+    "tdg": gates.TdgGate,
+    "x": gates.XGate,
+    "y": gates.YGate,
+    "z": gates.ZGate,
+    "id": gates.IGate,
+    "i": gates.IGate,
+    "r": gates.RGate,
+    "rx": gates.RXGate,
+    "ry": gates.RYGate,
+    "rz": gates.RZGate,
+}
 
 
 def single_gate_matrix(gate: str, params: list[float] | None = None) -> np.ndarray:
@@ -40,42 +63,55 @@ def single_gate_matrix(gate: str, params: list[float] | None = None) -> np.ndarr
     """
     if params is None:
         params = []
-
-    if gate == "U":
-        gc = gates.UGate
-    elif gate == "u3":
-        gc = gates.U3Gate
-    elif gate == "h":
-        gc = gates.HGate
-    elif gate == "u":
-        gc = gates.UGate
-    elif gate == "p":
-        gc = gates.PhaseGate
-    elif gate == "u2":
-        gc = gates.U2Gate
-    elif gate == "u1":
-        gc = gates.U1Gate
-    elif gate == "rz":
-        gc = gates.RZGate
-    elif gate == "id":
-        gc = gates.IGate
-    elif gate == "sx":
-        gc = gates.SXGate
-    elif gate == "x":
-        gc = gates.XGate
+    if gate in SINGLE_QUBIT_GATES:
+        gc = SINGLE_QUBIT_GATES[gate]
     else:
         raise QiskitError("Gate is not a valid basis gate for this simulator: %s" % gate)
 
     return gc(*params).to_matrix()
 
 
-# Cache CX matrix as no parameters.
-_CX_MATRIX = gates.CXGate().to_matrix()
-
-
-def cx_gate_matrix() -> np.ndarray:
-    """Get the matrix for a controlled-NOT gate."""
-    return _CX_MATRIX
+# Two qubit gates WITHOUT parameters: name -> matrix
+TWO_QUBIT_GATES = {
+    "CX": gates.CXGate().to_matrix(),
+    "cx": gates.CXGate().to_matrix(),
+    "ecr": gates.ECRGate().to_matrix(),
+    "cy": gates.CYGate().to_matrix(),
+    "cz": gates.CZGate().to_matrix(),
+    "swap": gates.SwapGate().to_matrix(),
+    "iswap": gates.iSwapGate().to_matrix(),
+    "ch": gates.CHGate().to_matrix(),
+    "cs": gates.CSGate().to_matrix(),
+    "csdg": gates.CSdgGate().to_matrix(),
+    "csx": gates.CSXGate().to_matrix(),
+    "dcx": gates.DCXGate().to_matrix(),
+}
+
+# Two qubit gates WITH parameters: name -> class
+TWO_QUBIT_GATES_WITH_PARAMETERS = {
+    "cp": gates.CPhaseGate,
+    "crx": gates.CRXGate,
+    "cry": gates.CRYGate,
+    "crz": gates.CRZGate,
+    "cu": gates.CUGate,
+    "cu1": gates.CU1Gate,
+    "cu3": gates.CU3Gate,
+    "rxx": gates.RXXGate,
+    "ryy": gates.RYYGate,
+    "rzz": gates.RZZGate,
+    "rzx": gates.RZXGate,
+    "xx_minus_yy": gates.XXMinusYYGate,
+    "xx_plus_yy": gates.XXPlusYYGate,
+}
+
+
+# Three qubit gates: name -> matrix
+THREE_QUBIT_GATES = {
+    "ccx": gates.CCXGate().to_matrix(),
+    "ccz": gates.CCZGate().to_matrix(),
+    "rccx": gates.RCCXGate().to_matrix(),
+    "cswap": gates.CSwapGate().to_matrix(),
+}
 
 
 def einsum_matmul_index(gate_indices: list[int], number_of_qubits: int) -> str:
diff --git a/qiskit/providers/basic_provider/basic_simulator.py b/qiskit/providers/basic_provider/basic_simulator.py
index e19021519194..b03a8df7ae5a 100644
--- a/qiskit/providers/basic_provider/basic_simulator.py
+++ b/qiskit/providers/basic_provider/basic_simulator.py
@@ -40,7 +40,7 @@
 
 from qiskit.circuit import QuantumCircuit
 from qiskit.circuit.library import UnitaryGate
-from qiskit.circuit.library.standard_gates import get_standard_gate_name_mapping
+from qiskit.circuit.library.standard_gates import get_standard_gate_name_mapping, GlobalPhaseGate
 from qiskit.providers import Provider
 from qiskit.providers.backend import BackendV2
 from qiskit.providers.models import BackendConfiguration
@@ -51,8 +51,12 @@
 
 from .basic_provider_job import BasicProviderJob
 from .basic_provider_tools import single_gate_matrix
-from .basic_provider_tools import SINGLE_QUBIT_GATES
-from .basic_provider_tools import cx_gate_matrix
+from .basic_provider_tools import (
+    SINGLE_QUBIT_GATES,
+    TWO_QUBIT_GATES,
+    TWO_QUBIT_GATES_WITH_PARAMETERS,
+    THREE_QUBIT_GATES,
+)
 from .basic_provider_tools import einsum_vecmul_index
 from .exceptions import BasicProviderError
 
@@ -138,21 +142,59 @@ def _build_basic_target(self) -> Target:
             num_qubits=None,
         )
         basis_gates = [
+            "ccx",
+            "ccz",
+            "ch",
+            "cp",
+            "crx",
+            "cry",
+            "crz",
+            "cs",
+            "csdg",
+            "cswap",
+            "csx",
+            "cu",
+            "cu1",
+            "cu3",
+            "cx",
+            "cy",
+            "cz",
+            "dcx",
+            "delay",
+            "ecr",
+            "global_phase",
             "h",
-            "u",
+            "id",
+            "iswap",
+            "measure",
             "p",
+            "r",
+            "rccx",
+            "reset",
+            "rx",
+            "rxx",
+            "ry",
+            "ryy",
+            "rz",
+            "rzx",
+            "rzz",
+            "s",
+            "sdg",
+            "swap",
+            "sx",
+            "sxdg",
+            "t",
+            "tdg",
+            "u",
             "u1",
             "u2",
             "u3",
-            "rz",
-            "sx",
-            "x",
-            "cx",
-            "id",
             "unitary",
-            "measure",
-            "delay",
-            "reset",
+            "x",
+            "xx_minus_yy",
+            "xx_plus_yy",
+            "y",
+            "z",
         ]
         inst_mapping = get_standard_gate_name_mapping()
         for name in basis_gates:
@@ -617,24 +659,39 @@ def run_experiment(self, experiment: QasmQobjExperiment) -> dict[str, ...]:
                             value >>= 1
                         if value != int(operation.conditional.val, 16):
                             continue
-                # Check if single  gate
                 if operation.name == "unitary":
                     qubits = operation.qubits
                     gate = operation.params[0]
                     self._add_unitary(gate, qubits)
+                elif operation.name in ("id", "u0", "delay"):
+                    pass
+                elif operation.name == "global_phase":
+                    params = getattr(operation, "params", None)
+                    gate = GlobalPhaseGate(*params).to_matrix()
+                    self._add_unitary(gate, [])
+                # Check if single qubit gate
                 elif operation.name in SINGLE_QUBIT_GATES:
                     params = getattr(operation, "params", None)
                     qubit = operation.qubits[0]
                     gate = single_gate_matrix(operation.name, params)
                     self._add_unitary(gate, [qubit])
-                # Check if CX gate
+                elif operation.name in TWO_QUBIT_GATES_WITH_PARAMETERS:
+                    params = getattr(operation, "params", None)
+                    qubit0 = operation.qubits[0]
+                    qubit1 = operation.qubits[1]
+                    gate = TWO_QUBIT_GATES_WITH_PARAMETERS[operation.name](*params).to_matrix()
+                    self._add_unitary(gate, [qubit0, qubit1])
-                elif operation.name in ("id", "u0"):
-                    pass
-                elif operation.name in ("CX", "cx"):
+                elif operation.name in TWO_QUBIT_GATES:
                     qubit0 = operation.qubits[0]
                     qubit1 = operation.qubits[1]
-                    gate = cx_gate_matrix()
+                    gate = TWO_QUBIT_GATES[operation.name]
                     self._add_unitary(gate, [qubit0, qubit1])
+                elif operation.name in THREE_QUBIT_GATES:
+                    qubit0 = operation.qubits[0]
+                    qubit1 = operation.qubits[1]
+                    qubit2 = operation.qubits[2]
+                    gate = THREE_QUBIT_GATES[operation.name]
+                    self._add_unitary(gate, [qubit0, qubit1, qubit2])
                 # Check if reset
                 elif operation.name == "reset":
                     qubit = operation.qubits[0]
diff --git a/qiskit/providers/fake_provider/fake_1q.py b/qiskit/providers/fake_provider/fake_1q.py
index 07589476149e..09959620bc92 100644
--- a/qiskit/providers/fake_provider/fake_1q.py
+++ b/qiskit/providers/fake_provider/fake_1q.py
@@ -32,7 +32,7 @@ def __init__(self):
         configuration = BackendProperties(
             backend_name="fake_1q",
             backend_version="0.0.0",
-            n_qubits=1,
+            num_qubits=1,
             basis_gates=["u1", "u2", "u3", "cx"],
             simulator=False,
             local=True,
diff --git a/qiskit/pulse/parameter_manager.py b/qiskit/pulse/parameter_manager.py
index 561eac01f55d..e5a4a1a1d2bd 100644
--- a/qiskit/pulse/parameter_manager.py
+++ b/qiskit/pulse/parameter_manager.py
@@ -54,7 +54,7 @@
 from copy import copy
 from typing import Any, Mapping, Sequence
 
-from qiskit.circuit import ParameterVector
+from qiskit.circuit.parametervector import ParameterVector, ParameterVectorElement
 from qiskit.circuit.parameter import Parameter
 from qiskit.circuit.parameterexpression import ParameterExpression, ParameterValueType
 from qiskit.pulse import instructions, channels
@@ -62,7 +62,11 @@
 from qiskit.pulse.library import SymbolicPulse, Waveform
 from qiskit.pulse.schedule import Schedule, ScheduleBlock
 from qiskit.pulse.transforms.alignments import AlignmentKind
-from qiskit.pulse.utils import format_parameter_value
+from qiskit.pulse.utils import (
+    format_parameter_value,
+    _validate_parameter_vector,
+    _validate_parameter_value,
+)
 
 
 class NodeVisitor:
@@ -362,7 +366,8 @@ def assign_parameters(
         self,
         pulse_program: Any,
         value_dict: dict[
-            ParameterExpression | ParameterVector, ParameterValueType | Sequence[ParameterValueType]
+            ParameterExpression | ParameterVector | str,
+            ParameterValueType | Sequence[ParameterValueType],
         ],
     ) -> Any:
         """Modify and return program data with parameters assigned according to the input.
@@ -397,7 +402,7 @@ def update_parameter_table(self, new_node: Any):
     def _unroll_param_dict(
         self,
         parameter_binds: Mapping[
-            Parameter | ParameterVector, ParameterValueType | Sequence[ParameterValueType]
+            Parameter | ParameterVector | str, ParameterValueType | Sequence[ParameterValueType]
         ],
     ) -> Mapping[Parameter, ParameterValueType]:
         """
@@ -410,21 +415,31 @@ def _unroll_param_dict(
             A dictionary from parameter to value.
         """
         out = {}
+        param_name_dict = {param.name: [] for param in self.parameters}
+        for param in self.parameters:
+            param_name_dict[param.name].append(param)
+        param_vec_dict = {
+            param.vector.name: param.vector
+            for param in self.parameters
+            if isinstance(param, ParameterVectorElement)
+        }
+        for name in param_vec_dict.keys():
+            if name in param_name_dict:
+                param_name_dict[name].append(param_vec_dict[name])
+            else:
+                param_name_dict[name] = [param_vec_dict[name]]
+
         for parameter, value in parameter_binds.items():
             if isinstance(parameter, ParameterVector):
-                if not isinstance(value, Sequence):
-                    raise PulseError(
-                        f"Parameter vector '{parameter.name}' has length {len(parameter)},"
-                        f" but was assigned to a single value."
-                    )
-                if len(parameter) != len(value):
-                    raise PulseError(
-                        f"Parameter vector '{parameter.name}' has length {len(parameter)},"
-                        f" but was assigned to {len(value)} values."
-                    )
+                _validate_parameter_vector(parameter, value)
                 out.update(zip(parameter, value))
             elif isinstance(parameter, str):
-                out[self.get_parameters(parameter)] = value
+                for param in param_name_dict[parameter]:
+                    is_vec = _validate_parameter_value(param, value)
+                    if is_vec:
+                        out.update(zip(param, value))
+                    else:
+                        out[param] = value
             else:
                 out[parameter] = value
         return out
diff --git a/qiskit/pulse/schedule.py b/qiskit/pulse/schedule.py
index 2a32b06a4780..5241da0c31d1 100644
--- a/qiskit/pulse/schedule.py
+++ b/qiskit/pulse/schedule.py
@@ -53,6 +53,7 @@
 from qiskit.pulse.utils import instruction_duration_validation
 from qiskit.pulse.reference_manager import ReferenceManager
 from qiskit.utils.multiprocessing import is_main_process
+from qiskit.utils import deprecate_arg
 
 
 Interval = Tuple[int, int]
@@ -714,16 +715,17 @@ def is_parameterized(self) -> bool:
     def assign_parameters(
         self,
         value_dict: dict[
-            ParameterExpression | ParameterVector, ParameterValueType | Sequence[ParameterValueType]
+            ParameterExpression | ParameterVector | str,
+            ParameterValueType | Sequence[ParameterValueType],
         ],
         inplace: bool = True,
     ) -> "Schedule":
         """Assign the parameters in this schedule according to the input.
 
         Args:
-            value_dict: A mapping from parameters (parameter vectors) to either
-            numeric values (list of numeric values)
-            or another Parameter expression (list of Parameter expressions).
+            value_dict: A mapping from parameters or parameter names (parameter vector
+            or parameter vector name) to either numeric values (list of numeric values)
+            or another parameter expression (list of parameter expressions).
             inplace: Set ``True`` to override this instance with new parameter.
 
         Returns:
@@ -1415,15 +1417,16 @@ def is_referenced(self) -> bool:
     def assign_parameters(
         self,
         value_dict: dict[
-            ParameterExpression | ParameterVector, ParameterValueType | Sequence[ParameterValueType]
+            ParameterExpression | ParameterVector | str,
+            ParameterValueType | Sequence[ParameterValueType],
         ],
         inplace: bool = True,
     ) -> "ScheduleBlock":
         """Assign the parameters in this schedule according to the input.
 
         Args:
-            value_dict: A mapping from parameters (parameter vectors) to either numeric values
-            (list of numeric values)
+            value_dict: A mapping from parameters or parameter names (parameter vector
+            or parameter vector name) to either numeric values (list of numeric values)
             or another parameter expression (list of parameter expressions).
             inplace: Set ``True`` to override this instance with new parameter.
 
@@ -1643,6 +1646,7 @@ def wrapper(*args, **kwargs):
     return decorator
 
 
+@deprecate_arg("show_barrier", new_alias="plot_barrier", since="1.1.0", pending=True)
 @_common_method(Schedule, ScheduleBlock)
 def draw(
     self,
@@ -1654,9 +1658,10 @@ def draw(
     show_snapshot: bool = True,
     show_framechange: bool = True,
     show_waveform_info: bool = True,
-    show_barrier: bool = True,
+    plot_barrier: bool = True,
     plotter: str = "mpl2d",
     axis: Any | None = None,
+    show_barrier: bool = True,
 ):
     """Plot the schedule.
 
@@ -1668,16 +1673,16 @@ def draw(
             preset stylesheets.
         backend (Optional[BaseBackend]): Backend object to play the input pulse program.
             If provided, the plotter may use to make the visualization hardware aware.
-        time_range: Set horizontal axis limit. Tuple `(tmin, tmax)`.
-        time_unit: The unit of specified time range either `dt` or `ns`.
-            The unit of `ns` is available only when `backend` object is provided.
+        time_range: Set horizontal axis limit. Tuple ``(tmin, tmax)``.
+        time_unit: The unit of specified time range either ``dt`` or ``ns``.
+            The unit of ``ns`` is available only when ``backend`` object is provided.
         disable_channels: A control property to show specific pulse channel.
             Pulse channel instances provided as a list are not shown in the output image.
         show_snapshot: Show snapshot instructions.
         show_framechange: Show frame change instructions. The frame change represents
             instructions that modulate phase or frequency of pulse channels.
         show_waveform_info: Show additional information about waveforms such as their name.
-        show_barrier: Show barrier lines.
+        plot_barrier: Show barrier lines.
         plotter: Name of plotter API to generate an output image.
             One of following APIs should be specified::
 
@@ -1690,6 +1695,7 @@ def draw(
             the plotters use a given ``axis`` instead of internally initializing
             a figure object. This object format depends on the plotter.
             See plotter argument for details.
+        show_barrier: DEPRECATED. Show barrier lines.
 
     Returns:
         Visualization output data.
@@ -1699,6 +1705,7 @@ def draw(
     # pylint: disable=cyclic-import
     from qiskit.visualization import pulse_drawer
 
+    del show_barrier
     return pulse_drawer(
         program=self,
         style=style,
@@ -1709,7 +1716,7 @@ def draw(
         show_snapshot=show_snapshot,
         show_framechange=show_framechange,
         show_waveform_info=show_waveform_info,
-        show_barrier=show_barrier,
+        plot_barrier=plot_barrier,
         plotter=plotter,
         axis=axis,
     )
diff --git a/qiskit/pulse/utils.py b/qiskit/pulse/utils.py
index fddc9469add9..ae87fbafadde 100644
--- a/qiskit/pulse/utils.py
+++ b/qiskit/pulse/utils.py
@@ -11,13 +11,14 @@
 # that they have been altered from the originals.
 
 """Module for common pulse programming utilities."""
-from typing import List, Dict, Union
+from typing import List, Dict, Union, Sequence
 import warnings
 
 import numpy as np
 
+from qiskit.circuit import ParameterVector, Parameter
 from qiskit.circuit.parameterexpression import ParameterExpression
-from qiskit.pulse.exceptions import UnassignedDurationError, QiskitError
+from qiskit.pulse.exceptions import UnassignedDurationError, QiskitError, PulseError
 
 
 def format_meas_map(meas_map: List[List[int]]) -> Dict[int, List[int]]:
@@ -117,3 +118,33 @@ def instruction_duration_validation(duration: int):
         raise QiskitError(
             f"Instruction duration must be a non-negative integer, got {duration} instead."
         )
+
+
+def _validate_parameter_vector(parameter: ParameterVector, value):
+    """Validate parameter vector and its value."""
+    if not isinstance(value, Sequence):
+        raise PulseError(
+            f"Parameter vector '{parameter.name}' has length {len(parameter)},"
+            f" but was assigned to {value}."
+        )
+    if len(parameter) != len(value):
+        raise PulseError(
+            f"Parameter vector '{parameter.name}' has length {len(parameter)},"
+            f" but was assigned to {len(value)} values."
+        )
+
+
+def _validate_single_parameter(parameter: Parameter, value):
+    """Validate single parameter and its value."""
+    if not isinstance(value, (int, float, complex, ParameterExpression)):
+        raise PulseError(f"Parameter '{parameter.name}' is not assignable to {value}.")
+
+
+def _validate_parameter_value(parameter, value):
+    """Validate parameter and its value."""
+    if isinstance(parameter, ParameterVector):
+        _validate_parameter_vector(parameter, value)
+        return True
+    else:
+        _validate_single_parameter(parameter, value)
+        return False
diff --git a/qiskit/qasm3/ast.py b/qiskit/qasm3/ast.py
index fd7aa11d4819..7674eace89db 100644
--- a/qiskit/qasm3/ast.py
+++ b/qiskit/qasm3/ast.py
@@ -123,6 +123,10 @@ class FloatType(ClassicalType, enum.Enum):
     OCT = 256
 
 
+class BoolType(ClassicalType):
+    """Type information for a Boolean."""
+
+
 class IntType(ClassicalType):
     """Type information for a signed integer."""
 
@@ -130,6 +134,13 @@ def __init__(self, size: Optional[int] = None):
         self.size = size
 
 
+class UintType(ClassicalType):
+    """Type information for an unsigned integer."""
+
+    def __init__(self, size: Optional[int] = None):
+        self.size = size
+
+
 class BitType(ClassicalType):
     """Type information for a single bit."""
 
diff --git a/qiskit/qasm3/exporter.py b/qiskit/qasm3/exporter.py
index d8d3a42087a6..b85e35e22a50 100644
--- a/qiskit/qasm3/exporter.py
+++ b/qiskit/qasm3/exporter.py
@@ -33,6 +33,7 @@
     Qubit,
     Reset,
     Delay,
+    Store,
 )
 from qiskit.circuit.bit import Bit
 from qiskit.circuit.classical import expr, types
@@ -62,7 +63,6 @@
 _RESERVED_KEYWORDS = frozenset(
     {
         "OPENQASM",
-        "U",
         "angle",
         "array",
         "barrier",
@@ -239,6 +239,7 @@ class GlobalNamespace:
 
     def __init__(self, includelist, basis_gates=()):
         self._data = {gate: self.BASIS_GATE for gate in basis_gates}
+        self._data["U"] = self.BASIS_GATE
 
         for includefile in includelist:
             if includefile == "stdgates.inc":
@@ -282,6 +283,10 @@ def __contains__(self, instruction):
             return True
         return False
 
+    def has_symbol(self, name: str) -> bool:
+        """Whether a symbol's name is present in the table."""
+        return name in self._data
+
     def register(self, instruction):
         """Register an instruction in the namespace"""
         # The second part of the condition is a nasty hack to ensure that gates that come with at
@@ -324,7 +329,7 @@ def register(self, instruction):
 class QASM3Builder:
     """QASM3 builder constructs an AST from a QuantumCircuit."""
 
-    builtins = (Barrier, Measure, Reset, Delay, BreakLoopOp, ContinueLoopOp)
+    builtins = (Barrier, Measure, Reset, Delay, BreakLoopOp, ContinueLoopOp, Store)
     loose_bit_prefix = "_bit"
     loose_qubit_prefix = "_qubit"
     gate_parameter_prefix = "_gate_p"
@@ -348,14 +353,12 @@ def __init__(
         self.includeslist = includeslist
         # `_global_io_declarations` and `_global_classical_declarations` are stateful, and any
         # operation that needs a parameter can append to them during the build.  We make all
-        # classical declarations global because the IBM QSS stack (our initial consumer of OQ3
-        # strings) prefers declarations to all be global, and it's valid OQ3, so it's not vendor
+        # classical declarations global because the IBM qe-compiler stack (our initial consumer of
+        # OQ3 strings) prefers declarations to all be global, and it's valid OQ3, so it's not vendor
         # lock-in.  It's possibly slightly memory inefficient, but that's not likely to be a problem
         # in the near term.
         self._global_io_declarations = []
-        self._global_classical_declarations = []
-        self._gate_to_declare = {}
-        self._opaque_to_declare = {}
+        self._global_classical_forward_declarations = []
         # An arbitrary counter to help with generation of unique ids for symbol names when there are
         # clashes (though we generally prefer to keep user names if possible).
         self._counter = itertools.count()
@@ -367,18 +370,15 @@ def __init__(
     def _unique_name(self, prefix: str, scope: _Scope) -> str:
         table = scope.symbol_map
         name = basename = _escape_invalid_identifier(prefix)
-        while name in table or name in _RESERVED_KEYWORDS:
+        while name in table or name in _RESERVED_KEYWORDS or self.global_namespace.has_symbol(name):
             name = f"{basename}__generated{next(self._counter)}"
         return name
 
     def _register_gate(self, gate):
         self.global_namespace.register(gate)
-        self._gate_to_declare[id(gate)] = gate
 
     def _register_opaque(self, instruction):
-        if instruction not in self.global_namespace:
-            self.global_namespace.register(instruction)
-            self._opaque_to_declare[id(instruction)] = instruction
+        self.global_namespace.register(instruction)
 
     def _register_variable(self, variable, scope: _Scope, name=None) -> ast.Identifier:
         """Register a variable in the symbol table for the given scope, returning the name that
@@ -399,6 +399,10 @@ def _register_variable(self, variable, scope: _Scope, name=None) -> ast.Identifi
                 raise QASM3ExporterError(
                     f"tried to reserve '{name}', but it is already used by '{table[name]}'"
                 )
+            if self.global_namespace.has_symbol(name):
+                raise QASM3ExporterError(
+                    f"tried to reserve '{name}', but it is already used by a gate"
+                )
         else:
             name = self._unique_name(variable.name, scope)
         identifier = ast.Identifier(name)
@@ -441,15 +445,66 @@ def build_header(self):
 
     def build_program(self):
         """Builds a Program"""
-        self.hoist_declarations(self.global_scope(assert_=True).circuit.data)
-        return ast.Program(self.build_header(), self.build_global_statements())
+        circuit = self.global_scope(assert_=True).circuit
+        if circuit.num_captured_vars:
+            raise QASM3ExporterError(
+                "cannot export an inner scope with captured variables as a top-level program"
+            )
+        header = self.build_header()
+
+        opaques_to_declare, gates_to_declare = self.hoist_declarations(
+            circuit.data, opaques=[], gates=[]
+        )
+        opaque_definitions = [
+            self.build_opaque_definition(instruction) for instruction in opaques_to_declare
+        ]
+        gate_definitions = [
+            self.build_gate_definition(instruction) for instruction in gates_to_declare
+        ]
+
+        # Early IBM runtime parameterisation uses unbound `Parameter` instances as `input`
+        # variables, not the explicit realtime `Var` variables, so we need this explicit scan.
+        self.hoist_global_parameter_declarations()
+        # Qiskit's clbits and classical registers need to get mapped to implicit OQ3 variables, but
+        # only if they're in the top-level circuit.  The QuantumCircuit data model is that inner
+        # clbits are bound to outer bits, and inner registers must be closing over outer ones.
+        self.hoist_classical_register_declarations()
+        # We hoist registers before new-style vars because registers are an older part of the data
+        # model (and used implicitly in PrimitivesV2 outputs) so they get the first go at reserving
+        # names in the symbol table.
+        self.hoist_classical_io_var_declarations()
+
+        # Similarly, QuantumCircuit qubits/registers are only new variables in the global scope.
+        quantum_declarations = self.build_quantum_declarations()
+        # This call has side-effects - it can populate `self._global_io_declarations` and
+        # `self._global_classical_forward_declarations` as a courtesy to the qe-compiler that
+        # prefers our hacky temporary `switch` target variables to be globally defined.
+        main_statements = self.build_current_scope()
 
-    def hoist_declarations(self, instructions):
-        """Walks the definitions in gates/instructions to make a list of gates to declare."""
+        statements = [
+            statement
+            for source in (
+                # In older versions of the reference OQ3 grammar, IO declarations had to come before
+                # anything else, so we keep doing that as a courtesy.
+                self._global_io_declarations,
+                opaque_definitions,
+                gate_definitions,
+                self._global_classical_forward_declarations,
+                quantum_declarations,
+                main_statements,
+            )
+            for statement in source
+        ]
+        return ast.Program(header, statements)
+
+    def hoist_declarations(self, instructions, *, opaques, gates):
+        """Walks the definitions in gates/instructions to make a list of gates to declare.
+
+        Mutates ``opaques`` and ``gates`` in-place, and returns them."""
         for instruction in instructions:
             if isinstance(instruction.operation, ControlFlowOp):
                 for block in instruction.operation.blocks:
-                    self.hoist_declarations(block.data)
+                    self.hoist_declarations(block.data, opaques=opaques, gates=gates)
                 continue
             if instruction.operation in self.global_namespace or isinstance(
                 instruction.operation, self.builtins
@@ -461,15 +516,20 @@ def hoist_declarations(self, instructions):
                 # tree, but isn't an OQ3 built-in.  We use `isinstance` because we haven't fully
                 # fixed what the name/class distinction is (there's a test from the original OQ3
                 # exporter that tries a naming collision with 'cx').
-                if instruction.operation not in self.global_namespace:
-                    self._register_gate(instruction.operation)
-            if instruction.operation.definition is None:
+                self._register_gate(instruction.operation)
+                gates.append(instruction.operation)
+            elif instruction.operation.definition is None:
                 self._register_opaque(instruction.operation)
+                opaques.append(instruction.operation)
             elif not isinstance(instruction.operation, Gate):
                 raise QASM3ExporterError("Exporting non-unitary instructions is not yet supported.")
             else:
-                self.hoist_declarations(instruction.operation.definition.data)
+                self.hoist_declarations(
+                    instruction.operation.definition.data, opaques=opaques, gates=gates
+                )
                 self._register_gate(instruction.operation)
+                gates.append(instruction.operation)
+        return opaques, gates
 
     def global_scope(self, assert_=False):
         """Return the global circuit scope that is used as the basis of the full program.  If
@@ -540,40 +600,6 @@ def build_includes(self):
         """Builds a list of included files."""
         return [ast.Include(filename) for filename in self.includeslist]
 
-    def build_global_statements(self) -> List[ast.Statement]:
-        """Get a list of the statements that form the global scope of the program."""
-        definitions = self.build_definitions()
-        # These two "declarations" functions populate stateful variables, since the calls to
-        # `build_quantum_instructions` might also append to those declarations.
-        self.build_parameter_declarations()
-        self.build_classical_declarations()
-        context = self.global_scope(assert_=True).circuit
-        quantum_declarations = self.build_quantum_declarations()
-        quantum_instructions = self.build_quantum_instructions(context.data)
-
-        return [
-            statement
-            for source in (
-                # In older versions of the reference OQ3 grammar, IO declarations had to come before
-                # anything else, so we keep doing that as a courtesy.
-                self._global_io_declarations,
-                definitions,
-                self._global_classical_declarations,
-                quantum_declarations,
-                quantum_instructions,
-            )
-            for statement in source
-        ]
-
-    def build_definitions(self):
-        """Builds all the definition."""
-        ret = []
-        for instruction in self._opaque_to_declare.values():
-            ret.append(self.build_opaque_definition(instruction))
-        for instruction in self._gate_to_declare.values():
-            ret.append(self.build_gate_definition(instruction))
-        return ret
-
     def build_opaque_definition(self, instruction):
         """Builds an Opaque gate definition as a CalibrationDefinition"""
         # We can't do anything sensible with this yet, so it's better to loudly say that.
@@ -604,7 +630,7 @@ def build_gate_definition(self, gate):
 
         self.push_context(gate.definition)
         signature = self.build_gate_signature(gate)
-        body = ast.QuantumBlock(self.build_quantum_instructions(gate.definition.data))
+        body = ast.QuantumBlock(self.build_current_scope())
         self.pop_context()
         return ast.QuantumGateDefinition(signature, body)
 
@@ -627,8 +653,10 @@ def build_gate_signature(self, gate):
         ]
         return ast.QuantumGateSignature(ast.Identifier(name), quantum_arguments, params or None)
 
-    def build_parameter_declarations(self):
-        """Builds lists of the input, output and standard variables used in this program."""
+    def hoist_global_parameter_declarations(self):
+        """Extend ``self._global_io_declarations`` and
+        ``self._global_classical_forward_declarations`` with any implicit declarations used to
+        support the early IBM efforts to use :class:`.Parameter` as an input variable."""
         global_scope = self.global_scope(assert_=True)
         for parameter in global_scope.circuit.parameters:
             parameter_name = self._register_variable(parameter, global_scope)
@@ -640,11 +668,13 @@ def build_parameter_declarations(self):
             if isinstance(declaration, ast.IODeclaration):
                 self._global_io_declarations.append(declaration)
             else:
-                self._global_classical_declarations.append(declaration)
+                self._global_classical_forward_declarations.append(declaration)
 
-    def build_classical_declarations(self):
-        """Extend the global classical declarations with AST nodes declaring all the classical bits
-        and registers.
+    def hoist_classical_register_declarations(self):
+        """Extend the global classical declarations with AST nodes declaring all the global-scope
+        circuit :class:`.Clbit` and :class:`.ClassicalRegister` instances.  Qiskit's data model
+        doesn't involve the declaration of *new* bits or registers in inner scopes; only the
+        :class:`.expr.Var` mechanism allows that.
 
         The behaviour of this function depends on the setting ``allow_aliasing``. If this
         is ``True``, then the output will be in the same form as the output of
@@ -670,12 +700,14 @@ def build_classical_declarations(self):
                 )
                 for i, clbit in enumerate(scope.circuit.clbits)
             )
-            self._global_classical_declarations.extend(clbits)
-            self._global_classical_declarations.extend(self.build_aliases(scope.circuit.cregs))
+            self._global_classical_forward_declarations.extend(clbits)
+            self._global_classical_forward_declarations.extend(
+                self.build_aliases(scope.circuit.cregs)
+            )
             return
         # If we're here, we're in the clbit happy path where there are no clbits that are in more
         # than one register.  We can output things very naturally.
-        self._global_classical_declarations.extend(
+        self._global_classical_forward_declarations.extend(
             ast.ClassicalDeclaration(
                 ast.BitType(),
                 self._register_variable(
@@ -691,10 +723,26 @@ def build_classical_declarations(self):
                 scope.symbol_map[bit] = ast.SubscriptedIdentifier(
                     name.string, ast.IntegerLiteral(i)
                 )
-            self._global_classical_declarations.append(
+            self._global_classical_forward_declarations.append(
                 ast.ClassicalDeclaration(ast.BitArrayType(len(register)), name)
             )
 
+    def hoist_classical_io_var_declarations(self):
+        """Hoist the declarations of classical IO :class:`.expr.Var` nodes into the global state.
+
+        Local :class:`.expr.Var` declarations are handled by the regular local-block scope builder,
+        and the :class:`.QuantumCircuit` data model ensures that the only time an IO variable can
+        occur is in an outermost block."""
+        scope = self.global_scope(assert_=True)
+        for var in scope.circuit.iter_input_vars():
+            self._global_io_declarations.append(
+                ast.IODeclaration(
+                    ast.IOModifier.INPUT,
+                    _build_ast_type(var.type),
+                    self._register_variable(var, scope),
+                )
+            )
+
     def build_quantum_declarations(self):
         """Return a list of AST nodes declaring all the qubits in the current scope, and all the
         alias declarations for these qubits."""
@@ -760,21 +808,37 @@ def build_aliases(self, registers: Iterable[Register]) -> List[ast.AliasStatemen
             out.append(ast.AliasStatement(name, ast.IndexSet(elements)))
         return out
 
-    def build_quantum_instructions(self, instructions):
-        """Builds a list of call statements"""
-        ret = []
-        for instruction in instructions:
-            if isinstance(instruction.operation, ForLoopOp):
-                ret.append(self.build_for_loop(instruction))
-                continue
-            if isinstance(instruction.operation, WhileLoopOp):
-                ret.append(self.build_while_loop(instruction))
-                continue
-            if isinstance(instruction.operation, IfElseOp):
-                ret.append(self.build_if_statement(instruction))
-                continue
-            if isinstance(instruction.operation, SwitchCaseOp):
-                ret.extend(self.build_switch_statement(instruction))
+    def build_current_scope(self) -> List[ast.Statement]:
+        """Build the instructions that occur in the current scope.
+
+        In addition to everything literally in the circuit's ``data`` field, this also includes
+        declarations for any local :class:`.expr.Var` nodes.
+        """
+        scope = self.current_scope()
+
+        # We forward-declare all local variables uninitialised at the top of their scope. It would
+        # be nice to declare the variable at the point of first store (so we can write things like
+        # `uint[8] a = 12;`), but there's lots of edge-case logic to catch with that around
+        # use-before-definition errors in the OQ3 output, for example if the user has side-stepped
+        # the `QuantumCircuit` API protection to produce a circuit that uses an uninitialised
+        # variable, or the initial write to a variable is within a control-flow scope.  (It would be
+        # easier to see the def/use chain needed to do this cleanly if we were using `DAGCircuit`.)
+        statements = [
+            ast.ClassicalDeclaration(_build_ast_type(var.type), self._register_variable(var, scope))
+            for var in scope.circuit.iter_declared_vars()
+        ]
+        for instruction in scope.circuit.data:
+            if isinstance(instruction.operation, ControlFlowOp):
+                if isinstance(instruction.operation, ForLoopOp):
+                    statements.append(self.build_for_loop(instruction))
+                elif isinstance(instruction.operation, WhileLoopOp):
+                    statements.append(self.build_while_loop(instruction))
+                elif isinstance(instruction.operation, IfElseOp):
+                    statements.append(self.build_if_statement(instruction))
+                elif isinstance(instruction.operation, SwitchCaseOp):
+                    statements.extend(self.build_switch_statement(instruction))
+                else:  # pragma: no cover
+                    raise RuntimeError(f"unhandled control-flow construct: {instruction.operation}")
                 continue
             # Build the node, ignoring any condition.
             if isinstance(instruction.operation, Gate):
@@ -795,6 +859,13 @@ def build_quantum_instructions(self, instructions):
                 ]
             elif isinstance(instruction.operation, Delay):
                 nodes = [self.build_delay(instruction)]
+            elif isinstance(instruction.operation, Store):
+                nodes = [
+                    ast.AssignmentStatement(
+                        self.build_expression(instruction.operation.lvalue),
+                        self.build_expression(instruction.operation.rvalue),
+                    )
+                ]
             elif isinstance(instruction.operation, BreakLoopOp):
                 nodes = [ast.BreakStatement()]
             elif isinstance(instruction.operation, ContinueLoopOp):
@@ -803,16 +874,16 @@ def build_quantum_instructions(self, instructions):
                 nodes = [self.build_subroutine_call(instruction)]
 
             if instruction.operation.condition is None:
-                ret.extend(nodes)
+                statements.extend(nodes)
             else:
                 body = ast.ProgramBlock(nodes)
-                ret.append(
+                statements.append(
                     ast.BranchingStatement(
                         self.build_expression(_lift_condition(instruction.operation.condition)),
                         body,
                     )
                 )
-        return ret
+        return statements
 
     def build_if_statement(self, instruction: CircuitInstruction) -> ast.BranchingStatement:
         """Build an :obj:`.IfElseOp` into a :obj:`.ast.BranchingStatement`."""
@@ -820,14 +891,14 @@ def build_if_statement(self, instruction: CircuitInstruction) -> ast.BranchingSt
 
         true_circuit = instruction.operation.blocks[0]
         self.push_scope(true_circuit, instruction.qubits, instruction.clbits)
-        true_body = self.build_program_block(true_circuit.data)
+        true_body = ast.ProgramBlock(self.build_current_scope())
         self.pop_scope()
         if len(instruction.operation.blocks) == 1:
             return ast.BranchingStatement(condition, true_body, None)
 
         false_circuit = instruction.operation.blocks[1]
         self.push_scope(false_circuit, instruction.qubits, instruction.clbits)
-        false_body = self.build_program_block(false_circuit.data)
+        false_body = ast.ProgramBlock(self.build_current_scope())
         self.pop_scope()
         return ast.BranchingStatement(condition, true_body, false_body)
 
@@ -838,7 +909,7 @@ def build_switch_statement(self, instruction: CircuitInstruction) -> Iterable[as
         target = self._reserve_variable_name(
             ast.Identifier(self._unique_name("switch_dummy", global_scope)), global_scope
         )
-        self._global_classical_declarations.append(
+        self._global_classical_forward_declarations.append(
             ast.ClassicalDeclaration(ast.IntType(), target, None)
         )
 
@@ -851,7 +922,7 @@ def case(values, case_block):
                     for v in values
                 ]
                 self.push_scope(case_block, instruction.qubits, instruction.clbits)
-                case_body = self.build_program_block(case_block.data)
+                case_body = ast.ProgramBlock(self.build_current_scope())
                 self.pop_scope()
                 return values, case_body
 
@@ -871,7 +942,7 @@ def case(values, case_block):
         default = None
         for values, block in instruction.operation.cases_specifier():
             self.push_scope(block, instruction.qubits, instruction.clbits)
-            case_body = self.build_program_block(block.data)
+            case_body = ast.ProgramBlock(self.build_current_scope())
             self.pop_scope()
             if CASE_DEFAULT in values:
                 # Even if it's mixed in with other cases, we can skip them and only output the
@@ -891,7 +962,7 @@ def build_while_loop(self, instruction: CircuitInstruction) -> ast.WhileLoopStat
         condition = self.build_expression(_lift_condition(instruction.operation.condition))
         loop_circuit = instruction.operation.blocks[0]
         self.push_scope(loop_circuit, instruction.qubits, instruction.clbits)
-        loop_body = self.build_program_block(loop_circuit.data)
+        loop_body = ast.ProgramBlock(self.build_current_scope())
         self.pop_scope()
         return ast.WhileLoopStatement(condition, loop_body)
 
@@ -921,7 +992,7 @@ def build_for_loop(self, instruction: CircuitInstruction) -> ast.ForLoopStatemen
                     "The values in OpenQASM 3 'for' loops must all be integers, but received"
                     f" '{indexset}'."
                 ) from None
-        body_ast = self.build_program_block(loop_circuit)
+        body_ast = ast.ProgramBlock(self.build_current_scope())
         self.pop_scope()
         return ast.ForLoopStatement(indexset_ast, loop_parameter_ast, body_ast)
 
@@ -961,10 +1032,6 @@ def build_integer(self, value) -> ast.IntegerLiteral:
             raise QASM3ExporterError(f"'{value}' is not an integer")  # pragma: no cover
         return ast.IntegerLiteral(int(value))
 
-    def build_program_block(self, instructions):
-        """Builds a ProgramBlock"""
-        return ast.ProgramBlock(self.build_quantum_instructions(instructions))
-
     def _rebind_scoped_parameters(self, expression):
         """If the input is a :class:`.ParameterExpression`, rebind any internal
         :class:`.Parameter`\\ s so that their names match their names in the scope.  Other inputs
@@ -1008,8 +1075,8 @@ def _infer_variable_declaration(
 
     This is very simplistic; it assumes all parameters are real numbers that need to be input to the
     program, unless one is used as a loop variable, in which case it shouldn't be declared at all,
-    because the ``for`` loop declares it implicitly (per the Qiskit/QSS reading of the OpenQASM
-    spec at Qiskit/openqasm@8ee55ec).
+    because the ``for`` loop declares it implicitly (per the Qiskit/qe-compiler reading of the
+    OpenQASM spec at openqasm/openqasm@8ee55ec).
 
     .. note::
 
@@ -1058,6 +1125,14 @@ def _lift_condition(condition):
     return expr.lift_legacy_condition(condition)
 
 
+def _build_ast_type(type_: types.Type) -> ast.ClassicalType:
+    if type_.kind is types.Bool:
+        return ast.BoolType()
+    if type_.kind is types.Uint:
+        return ast.UintType(type_.width)
+    raise RuntimeError(f"unhandled expr type '{type_}'")  # pragma: no cover
+
+
 class _ExprBuilder(expr.ExprVisitor[ast.Expression]):
     __slots__ = ("lookup",)
 
@@ -1069,7 +1144,7 @@ def __init__(self, lookup):
         self.lookup = lookup
 
     def visit_var(self, node, /):
-        return self.lookup(node.var)
+        return self.lookup(node) if node.standalone else self.lookup(node.var)
 
     def visit_value(self, node, /):
         if node.type.kind is types.Bool:
@@ -1080,14 +1155,8 @@ def visit_value(self, node, /):
 
     def visit_cast(self, node, /):
         if node.implicit:
-            return node.accept(self)
-        if node.type.kind is types.Bool:
-            oq3_type = ast.BoolType()
-        elif node.type.kind is types.Uint:
-            oq3_type = ast.BitArrayType(node.type.width)
-        else:
-            raise RuntimeError(f"unhandled cast type '{node.type}'")
-        return ast.Cast(oq3_type, node.operand.accept(self))
+            return node.operand.accept(self)
+        return ast.Cast(_build_ast_type(node.type), node.operand.accept(self))
 
     def visit_unary(self, node, /):
         return ast.Unary(ast.Unary.Op[node.op.name], node.operand.accept(self))
diff --git a/qiskit/qasm3/printer.py b/qiskit/qasm3/printer.py
index 94d12a7ecff6..ba253144a168 100644
--- a/qiskit/qasm3/printer.py
+++ b/qiskit/qasm3/printer.py
@@ -204,11 +204,19 @@ def _visit_CalibrationGrammarDeclaration(self, node: ast.CalibrationGrammarDecla
     def _visit_FloatType(self, node: ast.FloatType) -> None:
         self.stream.write(f"float[{self._FLOAT_WIDTH_LOOKUP[node]}]")
 
+    def _visit_BoolType(self, _node: ast.BoolType) -> None:
+        self.stream.write("bool")
+
     def _visit_IntType(self, node: ast.IntType) -> None:
         self.stream.write("int")
         if node.size is not None:
             self.stream.write(f"[{node.size}]")
 
+    def _visit_UintType(self, node: ast.UintType) -> None:
+        self.stream.write("uint")
+        if node.size is not None:
+            self.stream.write(f"[{node.size}]")
+
     def _visit_BitType(self, _node: ast.BitType) -> None:
         self.stream.write("bit")
 
diff --git a/qiskit/qpy/__init__.py b/qiskit/qpy/__init__.py
index 1274ff110be6..7851db5c2a10 100644
--- a/qiskit/qpy/__init__.py
+++ b/qiskit/qpy/__init__.py
@@ -79,6 +79,12 @@
 
 .. autoexception:: QpyError
 
+When a lower-than-maximum target QPY version is set for serialization, but the object to be
+serialized contains features that cannot be represented in that format, a subclass of
+:exc:`QpyError` is raised:
+
+.. autoexception:: UnsupportedFeatureForVersion
+
 Attributes:
     QPY_VERSION (int): The current QPY format version as of this release. This
         is the default value of the ``version`` keyword argument on
@@ -111,6 +117,135 @@
 
 .. autoexception:: QPYLoadingDeprecatedFeatureWarning
 
+QPY format version history
+--------------------------
+
+If you're planning to load a QPY file between different Qiskit versions, knowing
+which QPY format versions were available in a given release is useful. As QPY is
+backwards compatible but not forwards compatible, you need to ensure a given
+QPY format version was released in the release you're calling :func:`.load`
+with. The following table lists the QPY versions that were supported in every
+Qiskit (and qiskit-terra prior to Qiskit 1.0.0) release going back to the introduction
+of QPY in qiskit-terra 0.18.0.
+
+.. list-table:: QPY Format Version History
+   :header-rows: 1
+
+   * - Qiskit (qiskit-terra for < 1.0.0) version
+     - :func:`.dump` format(s) output versions
+     - :func:`.load` maximum supported version (older format versions can always be read)
+   * - 1.1.0
+     - 10, 11, 12
+     - 12
+   * - 1.0.2
+     - 10, 11
+     - 12
+   * - 1.0.1
+     - 10, 11
+     - 11
+   * - 1.0.0
+     - 10, 11
+     - 11
+   * - 0.46.1
+     - 10
+     - 10
+   * - 0.45.3
+     - 10
+     - 10
+   * - 0.45.2
+     - 10
+     - 10
+   * - 0.45.1
+     - 10
+     - 10
+   * - 0.45.0
+     - 10
+     - 10
+   * - 0.25.3
+     - 9
+     - 9
+   * - 0.25.2
+     - 9
+     - 9
+   * - 0.25.1
+     - 9
+     - 9
+   * - 0.24.2
+     - 8
+     - 8
+   * - 0.24.1
+     - 7
+     - 7
+   * - 0.24.0
+     - 7
+     - 7
+   * - 0.23.3
+     - 6
+     - 6
+   * - 0.23.2
+     - 6
+     - 6
+   * - 0.23.1
+     - 6
+     - 6
+   * - 0.23.0
+     - 6
+     - 6
+   * - 0.22.4
+     - 5
+     - 5
+   * - 0.22.3
+     - 5
+     - 5
+   * - 0.22.2
+     - 5
+     - 5
+   * - 0.22.1
+     - 5
+     - 5
+   * - 0.22.0
+     - 5
+     - 5
+   * - 0.21.2
+     - 5
+     - 5
+   * - 0.21.1
+     - 5
+     - 5
+   * - 0.21.0
+     - 5
+     - 5
+   * - 0.20.2
+     - 4
+     - 4
+   * - 0.20.1
+     - 4
+     - 4
+   * - 0.20.0
+     - 4
+     - 4
+   * - 0.19.2
+     - 4
+     - 4
+   * - 0.19.1
+     - 3
+     - 3
+   * - 0.19.0
+     - 2
+     - 2
+   * - 0.18.3
+     - 1
+     - 1
+   * - 0.18.2
+     - 1
+     - 1
+   * - 0.18.1
+     - 1
+     - 1
+   * - 0.18.0
+     - 1
+     - 1
+
 .. _qpy_format:
 
 **********
@@ -156,12 +291,97 @@
 The file header is immediately followed by the circuit payloads.
 Each individual circuit is composed of the following parts:
 
-``HEADER | METADATA | REGISTERS | CUSTOM_DEFINITIONS | INSTRUCTIONS``
+``HEADER | METADATA | REGISTERS | STANDALONE_VARS | CUSTOM_DEFINITIONS | INSTRUCTIONS``
+
+The ``STANDALONE_VARS`` are new in QPY version 12; before that, there was no data between
+``REGISTERS`` and ``CUSTOM_DEFINITIONS``.
 
 There is a circuit payload for each circuit (where the total number is dictated
 by ``num_circuits`` in the file header). There is no padding between the
 circuits in the data.
 
+.. _qpy_version_12:
+
+Version 12
+==========
+
+Version 12 adds support for:
+
+* circuits containing memory-owning :class:`.expr.Var` variables.
+
+Changes to HEADER
+-----------------
+
+The HEADER struct for an individual circuit has added one ``uint32_t`` count of the standalone
+variables (input, captured and locally declared) in the circuit.  The new form looks like:
+
+.. code-block:: c
+
+    struct {
+        uint16_t name_size;
+        char global_phase_type;
+        uint16_t global_phase_size;
+        uint32_t num_qubits;
+        uint32_t num_clbits;
+        uint64_t metadata_size;
+        uint32_t num_registers;
+        uint64_t num_instructions;
+        uint32_t num_vars;
+    } HEADER_V12;
+
+The ``HEADER_V12`` struct is followed immediately by the same name, global-phase, metadata and
+register information as the V2 version of the header.  Immediately following the registers are
+``num_vars`` instances of ``EXPR_VAR_DECLARATION`` that define the variables in this circuit.
+After that, the data continues with custom definitions and instructions as in prior versions of QPY.
+
+
+EXPR_VAR_DECLARATION
+--------------------
+
+An ``EXPR_VAR_DECLARATION`` defines an :class:`.expr.Var` instance that is standalone; that is, it
+represents a self-owned memory location rather than wrapping a :class:`.Clbit` or
+:class:`.ClassicalRegister`.  The payload is a C struct:
+
+.. code-block:: c
+
+    struct {
+        char uuid_bytes[16];
+        char usage;
+        uint16_t name_size;
+    }
+
+which is immediately followed by an ``EXPR_TYPE`` payload and then ``name_size`` bytes of UTF-8
+encoded string data containing the name of the variable.
+
+The ``char`` usage type code takes the following values:
+
+=========  =========================================================================================
+Type code  Meaning
+=========  =========================================================================================
+``I``      An ``input`` variable to the circuit.
+
+``C``      A ``capture`` variable to the circuit.
+
+``L``      A locally declared variable to the circuit.
+=========  =========================================================================================
+
+
+Changes to EXPR_VAR
+-------------------
+
+The EXPR_VAR variable has gained a new type code and payload, in addition to the pre-existing ones:
+
+===========================  =========  ============================================================
+Python class                 Type code  Payload
+===========================  =========  ============================================================
+:class:`.UUID`               ``U``      One ``uint32_t`` index of the variable into the series of
+                                        ``EXPR_VAR_DECLARATION`` variables that were written
+                                        immediately after the circuit header.
+===========================  =========  ============================================================
+
+Notably, this new type-code indexes into pre-defined variables from the circuit header, rather than
+redefining the variable again in each location it is used.
+
 .. _qpy_version_11:
 
 Version 11
@@ -208,17 +428,18 @@
 Version 10
 ==========
 
-Version 10 adds support for symengine-native serialization for objects of type
-:class:`~.ParameterExpression` as well as symbolic expressions in Pulse schedule blocks. Version
-10 also adds support for new fields in the :class:`~.TranspileLayout` class added in the Qiskit
-0.45.0 release.
+Version 10 adds support for:
+
+* symengine-native serialization for objects of type :class:`~.ParameterExpression` as well as
+  symbolic expressions in Pulse schedule blocks.
+* new fields in the :class:`~.TranspileLayout` class added in the Qiskit 0.45.0 release.
 
 The symbolic_encoding field is added to the file header, and a new encoding type char
 is introduced, mapped to each symbolic library as follows: ``p`` refers to sympy
 encoding and ``e`` refers to symengine encoding.
 
-FILE_HEADER
------------
+Changes to FILE_HEADER
+----------------------
 
 The contents of FILE_HEADER after V10 are defined as a C struct as:
 
@@ -231,10 +452,10 @@
         uint8_t qiskit_patch_version;
         uint64_t num_circuits;
         char symbolic_encoding;
-    }
+    } FILE_HEADER_V10;
 
-LAYOUT
-------
+Changes to LAYOUT
+-----------------
 
 The ``LAYOUT`` struct is updated to have an additional ``input_qubit_count`` field.
 With version 10 the ``LAYOUT`` struct is now:
@@ -1393,7 +1614,7 @@ class if it's defined in Qiskit. Otherwise it falls back to the custom
 .. [#f3] https://docs.python.org/3/c-api/complex.html#c.Py_complex
 """
 
-from .exceptions import QpyError, QPYLoadingDeprecatedFeatureWarning
+from .exceptions import QpyError, UnsupportedFeatureForVersion, QPYLoadingDeprecatedFeatureWarning
 from .interface import dump, load
 
 # For backward compatibility. Provide, Runtime, Experiment call these private functions.
diff --git a/qiskit/qpy/binary_io/circuits.py b/qiskit/qpy/binary_io/circuits.py
index 40bb5850043b..1cf003ff3585 100644
--- a/qiskit/qpy/binary_io/circuits.py
+++ b/qiskit/qpy/binary_io/circuits.py
@@ -40,13 +40,39 @@
 from qiskit.circuit.instruction import Instruction
 from qiskit.circuit.quantumcircuit import QuantumCircuit
 from qiskit.circuit.quantumregister import QuantumRegister, Qubit
-from qiskit.qpy import common, formats, type_keys
+from qiskit.qpy import common, formats, type_keys, exceptions
 from qiskit.qpy.binary_io import value, schedules
 from qiskit.quantum_info.operators import SparsePauliOp, Clifford
 from qiskit.synthesis import evolution as evo_synth
 from qiskit.transpiler.layout import Layout, TranspileLayout
 
 
+def _read_header_v12(file_obj, version, vectors, metadata_deserializer=None):
+    """Read the version-12 circuit header and its variable-length payloads from ``file_obj``.
+
+    Returns a ``(header, name, metadata)`` tuple, where ``header`` is a dict holding the global
+    phase and the counts (including ``num_vars``) taken from the fixed-size struct.
+    """
+    raw = file_obj.read(formats.CIRCUIT_HEADER_V12_SIZE)
+    fields = formats.CIRCUIT_HEADER_V12._make(struct.unpack(formats.CIRCUIT_HEADER_V12_PACK, raw))
+    # The payloads trail the struct in this order: circuit name, global phase, metadata JSON.
+    name = file_obj.read(fields.name_size).decode(common.ENCODE)
+    phase_raw = file_obj.read(fields.global_phase_size)
+    global_phase = value.loads_value(
+        fields.global_phase_type, phase_raw, version=version, vectors=vectors
+    )
+    metadata = json.loads(file_obj.read(fields.metadata_size), cls=metadata_deserializer)
+    header = {
+        "global_phase": global_phase,
+        "num_qubits": fields.num_qubits,
+        "num_clbits": fields.num_clbits,
+        "num_registers": fields.num_registers,
+        "num_instructions": fields.num_instructions,
+        "num_vars": fields.num_vars,
+    }
+    return header, name, metadata
+
+
 def _read_header_v2(file_obj, version, vectors, metadata_deserializer=None):
     data = formats.CIRCUIT_HEADER_V2._make(
         struct.unpack(
@@ -133,7 +159,14 @@ def _read_registers(file_obj, num_registers):
 
 
 def _loads_instruction_parameter(
-    type_key, data_bytes, version, vectors, registers, circuit, use_symengine
+    type_key,
+    data_bytes,
+    version,
+    vectors,
+    registers,
+    circuit,
+    use_symengine,
+    standalone_vars,
 ):
     if type_key == type_keys.Program.CIRCUIT:
         param = common.data_from_binary(data_bytes, read_circuit, version=version)
@@ -152,6 +185,7 @@ def _loads_instruction_parameter(
                 registers=registers,
                 circuit=circuit,
                 use_symengine=use_symengine,
+                standalone_vars=standalone_vars,
             )
         )
     elif type_key == type_keys.Value.INTEGER:
@@ -172,6 +206,7 @@ def _loads_instruction_parameter(
             clbits=clbits,
             cregs=registers["c"],
             use_symengine=use_symengine,
+            standalone_vars=standalone_vars,
         )
 
     return param
@@ -186,7 +221,14 @@ def _loads_register_param(data_bytes, circuit, registers):
 
 
 def _read_instruction(
-    file_obj, circuit, registers, custom_operations, version, vectors, use_symengine
+    file_obj,
+    circuit,
+    registers,
+    custom_operations,
+    version,
+    vectors,
+    use_symengine,
+    standalone_vars,
 ):
     if version < 5:
         instruction = formats.CIRCUIT_INSTRUCTION._make(
@@ -224,6 +266,7 @@ def _read_instruction(
             clbits=circuit.clbits,
             cregs=registers["c"],
             use_symengine=use_symengine,
+            standalone_vars=standalone_vars,
         )
     # Load Arguments
     if circuit is not None:
@@ -252,14 +295,28 @@ def _read_instruction(
     for _param in range(instruction.num_parameters):
         type_key, data_bytes = common.read_generic_typed_data(file_obj)
         param = _loads_instruction_parameter(
-            type_key, data_bytes, version, vectors, registers, circuit, use_symengine
+            type_key,
+            data_bytes,
+            version,
+            vectors,
+            registers,
+            circuit,
+            use_symengine,
+            standalone_vars,
         )
         params.append(param)
 
     # Load Gate object
     if gate_name in {"Gate", "Instruction", "ControlledGate"}:
         inst_obj = _parse_custom_operation(
-            custom_operations, gate_name, params, version, vectors, registers, use_symengine
+            custom_operations,
+            gate_name,
+            params,
+            version,
+            vectors,
+            registers,
+            use_symengine,
+            standalone_vars,
         )
         inst_obj.condition = condition
         if instruction.label_size > 0:
@@ -270,7 +327,14 @@ def _read_instruction(
         return None
     elif gate_name in custom_operations:
         inst_obj = _parse_custom_operation(
-            custom_operations, gate_name, params, version, vectors, registers, use_symengine
+            custom_operations,
+            gate_name,
+            params,
+            version,
+            vectors,
+            registers,
+            use_symengine,
+            standalone_vars,
         )
         inst_obj.condition = condition
         if instruction.label_size > 0:
@@ -361,7 +425,14 @@ def _read_instruction(
 
 
 def _parse_custom_operation(
-    custom_operations, gate_name, params, version, vectors, registers, use_symengine
+    custom_operations,
+    gate_name,
+    params,
+    version,
+    vectors,
+    registers,
+    use_symengine,
+    standalone_vars,
 ):
     if version >= 5:
         (
@@ -394,7 +465,14 @@ def _parse_custom_operation(
     if version >= 5 and type_key == type_keys.CircuitInstruction.CONTROLLED_GATE:
         with io.BytesIO(base_gate_raw) as base_gate_obj:
             base_gate = _read_instruction(
-                base_gate_obj, None, registers, custom_operations, version, vectors, use_symengine
+                base_gate_obj,
+                None,
+                registers,
+                custom_operations,
+                version,
+                vectors,
+                use_symengine,
+                standalone_vars,
             )
         if ctrl_state < 2**num_ctrl_qubits - 1:
             # If open controls, we need to discard the control suffix when setting the name.
@@ -413,7 +491,14 @@ def _parse_custom_operation(
     if version >= 11 and type_key == type_keys.CircuitInstruction.ANNOTATED_OPERATION:
         with io.BytesIO(base_gate_raw) as base_gate_obj:
             base_gate = _read_instruction(
-                base_gate_obj, None, registers, custom_operations, version, vectors, use_symengine
+                base_gate_obj,
+                None,
+                registers,
+                custom_operations,
+                version,
+                vectors,
+                use_symengine,
+                standalone_vars,
             )
         inst_obj = AnnotatedOperation(base_op=base_gate, modifiers=params)
         return inst_obj
@@ -572,10 +657,12 @@ def _dumps_register(register, index_map):
     return b"\x00" + str(index_map["c"][register]).encode(common.ENCODE)
 
 
-def _dumps_instruction_parameter(param, index_map, use_symengine):
+def _dumps_instruction_parameter(
+    param, index_map, use_symengine, *, version, standalone_var_indices
+):
     if isinstance(param, QuantumCircuit):
         type_key = type_keys.Program.CIRCUIT
-        data_bytes = common.data_to_binary(param, write_circuit)
+        data_bytes = common.data_to_binary(param, write_circuit, version=version)
     elif isinstance(param, Modifier):
         type_key = type_keys.Value.MODIFIER
         data_bytes = common.data_to_binary(param, _write_modifier)
@@ -585,7 +672,12 @@ def _dumps_instruction_parameter(param, index_map, use_symengine):
     elif isinstance(param, tuple):
         type_key = type_keys.Container.TUPLE
         data_bytes = common.sequence_to_binary(
-            param, _dumps_instruction_parameter, index_map=index_map, use_symengine=use_symengine
+            param,
+            _dumps_instruction_parameter,
+            index_map=index_map,
+            use_symengine=use_symengine,
+            version=version,
+            standalone_var_indices=standalone_var_indices,
         )
     elif isinstance(param, int):
         # TODO This uses little endian. This should be fixed in next QPY version.
@@ -600,14 +692,25 @@ def _dumps_instruction_parameter(param, index_map, use_symengine):
         data_bytes = _dumps_register(param, index_map)
     else:
         type_key, data_bytes = value.dumps_value(
-            param, index_map=index_map, use_symengine=use_symengine
+            param,
+            index_map=index_map,
+            use_symengine=use_symengine,
+            standalone_var_indices=standalone_var_indices,
         )
 
     return type_key, data_bytes
 
 
 # pylint: disable=too-many-boolean-expressions
-def _write_instruction(file_obj, instruction, custom_operations, index_map, use_symengine, version):
+def _write_instruction(
+    file_obj,
+    instruction,
+    custom_operations,
+    index_map,
+    use_symengine,
+    version,
+    standalone_var_indices=None,
+):
     if isinstance(instruction.operation, Instruction):
         gate_class_name = instruction.operation.base_class.__name__
     else:
@@ -702,7 +805,12 @@ def _write_instruction(file_obj, instruction, custom_operations, index_map, use_
     file_obj.write(gate_class_name)
     file_obj.write(label_raw)
     if condition_type is type_keys.Condition.EXPRESSION:
-        value.write_value(file_obj, op_condition, index_map=index_map)
+        value.write_value(
+            file_obj,
+            op_condition,
+            index_map=index_map,
+            standalone_var_indices=standalone_var_indices,
+        )
     else:
         file_obj.write(condition_register)
     # Encode instruction args
@@ -718,7 +826,13 @@ def _write_instruction(file_obj, instruction, custom_operations, index_map, use_
         file_obj.write(instruction_arg_raw)
     # Encode instruction params
     for param in instruction_params:
-        type_key, data_bytes = _dumps_instruction_parameter(param, index_map, use_symengine)
+        type_key, data_bytes = _dumps_instruction_parameter(
+            param,
+            index_map,
+            use_symengine,
+            version=version,
+            standalone_var_indices=standalone_var_indices,
+        )
         common.write_generic_typed_data(file_obj, type_key, data_bytes)
     return custom_operations_list
 
@@ -788,7 +902,9 @@ def _write_modifier(file_obj, modifier):
     file_obj.write(modifier_data)
 
 
-def _write_custom_operation(file_obj, name, operation, custom_operations, use_symengine, version):
+def _write_custom_operation(
+    file_obj, name, operation, custom_operations, use_symengine, version, *, standalone_var_indices
+):
     type_key = type_keys.CircuitInstruction.assign(operation)
     has_definition = False
     size = 0
@@ -813,7 +929,7 @@ def _write_custom_operation(file_obj, name, operation, custom_operations, use_sy
         # Build internal definition to support overloaded subclasses by
         # calling definition getter on object
         operation.definition  # pylint: disable=pointless-statement
-        data = common.data_to_binary(operation._definition, write_circuit)
+        data = common.data_to_binary(operation._definition, write_circuit, version=version)
         size = len(data)
         num_ctrl_qubits = operation.num_ctrl_qubits
         ctrl_state = operation.ctrl_state
@@ -823,7 +939,7 @@ def _write_custom_operation(file_obj, name, operation, custom_operations, use_sy
         base_gate = operation.base_op
     elif operation.definition is not None:
         has_definition = True
-        data = common.data_to_binary(operation.definition, write_circuit)
+        data = common.data_to_binary(operation.definition, write_circuit, version=version)
         size = len(data)
     if base_gate is None:
         base_gate_raw = b""
@@ -836,6 +952,7 @@ def _write_custom_operation(file_obj, name, operation, custom_operations, use_sy
                 {},
                 use_symengine,
                 version,
+                standalone_var_indices=standalone_var_indices,
             )
             base_gate_raw = base_gate_buffer.getvalue()
     name_raw = name.encode(common.ENCODE)
@@ -1103,23 +1220,49 @@ def write_circuit(
     num_registers = num_qregs + num_cregs
 
     # Write circuit header
-    header_raw = formats.CIRCUIT_HEADER_V2(
-        name_size=len(circuit_name),
-        global_phase_type=global_phase_type,
-        global_phase_size=len(global_phase_data),
-        num_qubits=circuit.num_qubits,
-        num_clbits=circuit.num_clbits,
-        metadata_size=metadata_size,
-        num_registers=num_registers,
-        num_instructions=num_instructions,
-    )
-    header = struct.pack(formats.CIRCUIT_HEADER_V2_PACK, *header_raw)
-    file_obj.write(header)
-    file_obj.write(circuit_name)
-    file_obj.write(global_phase_data)
-    file_obj.write(metadata_raw)
-    # Write header payload
-    file_obj.write(registers_raw)
+    if version >= 12:
+        header_raw = formats.CIRCUIT_HEADER_V12(
+            name_size=len(circuit_name),
+            global_phase_type=global_phase_type,
+            global_phase_size=len(global_phase_data),
+            num_qubits=circuit.num_qubits,
+            num_clbits=circuit.num_clbits,
+            metadata_size=metadata_size,
+            num_registers=num_registers,
+            num_instructions=num_instructions,
+            num_vars=circuit.num_vars,
+        )
+        header = struct.pack(formats.CIRCUIT_HEADER_V12_PACK, *header_raw)
+        file_obj.write(header)
+        file_obj.write(circuit_name)
+        file_obj.write(global_phase_data)
+        file_obj.write(metadata_raw)
+        # Write header payload
+        file_obj.write(registers_raw)
+        standalone_var_indices = value.write_standalone_vars(file_obj, circuit)
+    else:
+        if circuit.num_vars:
+            raise exceptions.UnsupportedFeatureForVersion(
+                "circuits containing realtime variables", required=12, target=version
+            )
+        header_raw = formats.CIRCUIT_HEADER_V2(
+            name_size=len(circuit_name),
+            global_phase_type=global_phase_type,
+            global_phase_size=len(global_phase_data),
+            num_qubits=circuit.num_qubits,
+            num_clbits=circuit.num_clbits,
+            metadata_size=metadata_size,
+            num_registers=num_registers,
+            num_instructions=num_instructions,
+        )
+        header = struct.pack(formats.CIRCUIT_HEADER_V2_PACK, *header_raw)
+        file_obj.write(header)
+        file_obj.write(circuit_name)
+        file_obj.write(global_phase_data)
+        file_obj.write(metadata_raw)
+        file_obj.write(registers_raw)
+        standalone_var_indices = {}
+
     instruction_buffer = io.BytesIO()
     custom_operations = {}
     index_map = {}
@@ -1127,7 +1270,13 @@ def write_circuit(
     index_map["c"] = {bit: index for index, bit in enumerate(circuit.clbits)}
     for instruction in circuit.data:
         _write_instruction(
-            instruction_buffer, instruction, custom_operations, index_map, use_symengine, version
+            instruction_buffer,
+            instruction,
+            custom_operations,
+            index_map,
+            use_symengine,
+            version,
+            standalone_var_indices=standalone_var_indices,
         )
 
     with io.BytesIO() as custom_operations_buffer:
@@ -1145,6 +1294,7 @@ def write_circuit(
                         custom_operations,
                         use_symengine,
                         version,
+                        standalone_var_indices=standalone_var_indices,
                     )
                 )
 
@@ -1186,16 +1336,21 @@ def read_circuit(file_obj, version, metadata_deserializer=None, use_symengine=Fa
     vectors = {}
     if version < 2:
         header, name, metadata = _read_header(file_obj, metadata_deserializer=metadata_deserializer)
-    else:
+    elif version < 12:
         header, name, metadata = _read_header_v2(
             file_obj, version, vectors, metadata_deserializer=metadata_deserializer
         )
+    else:
+        header, name, metadata = _read_header_v12(
+            file_obj, version, vectors, metadata_deserializer=metadata_deserializer
+        )
 
     global_phase = header["global_phase"]
     num_qubits = header["num_qubits"]
     num_clbits = header["num_clbits"]
     num_registers = header["num_registers"]
     num_instructions = header["num_instructions"]
+    num_vars = header.get("num_vars", 0)
     # `out_registers` is two "name: register" maps segregated by type for the rest of QPY, and
     # `all_registers` is the complete ordered list used to construct the `QuantumCircuit`.
     out_registers = {"q": {}, "c": {}}
@@ -1252,6 +1407,7 @@ def read_circuit(file_obj, version, metadata_deserializer=None, use_symengine=Fa
             "q": [Qubit() for _ in out_bits["q"]],
             "c": [Clbit() for _ in out_bits["c"]],
         }
+    var_segments, standalone_var_indices = value.read_standalone_vars(file_obj, num_vars)
     circ = QuantumCircuit(
         out_bits["q"],
         out_bits["c"],
@@ -1259,11 +1415,22 @@ def read_circuit(file_obj, version, metadata_deserializer=None, use_symengine=Fa
         name=name,
         global_phase=global_phase,
         metadata=metadata,
+        inputs=var_segments[type_keys.ExprVarDeclaration.INPUT],
+        captures=var_segments[type_keys.ExprVarDeclaration.CAPTURE],
     )
+    for declaration in var_segments[type_keys.ExprVarDeclaration.LOCAL]:
+        circ.add_uninitialized_var(declaration)
     custom_operations = _read_custom_operations(file_obj, version, vectors)
     for _instruction in range(num_instructions):
         _read_instruction(
-            file_obj, circ, out_registers, custom_operations, version, vectors, use_symengine
+            file_obj,
+            circ,
+            out_registers,
+            custom_operations,
+            version,
+            vectors,
+            use_symengine,
+            standalone_var_indices,
         )
 
     # Read calibrations
diff --git a/qiskit/qpy/binary_io/value.py b/qiskit/qpy/binary_io/value.py
index 1c11d4ad27c1..a3d7ff088139 100644
--- a/qiskit/qpy/binary_io/value.py
+++ b/qiskit/qpy/binary_io/value.py
@@ -95,11 +95,12 @@ def _write_parameter_expression(file_obj, obj, use_symengine):
 
 
 class _ExprWriter(expr.ExprVisitor[None]):
-    __slots__ = ("file_obj", "clbit_indices")
+    __slots__ = ("file_obj", "clbit_indices", "standalone_var_indices")
 
-    def __init__(self, file_obj, clbit_indices):
+    def __init__(self, file_obj, clbit_indices, standalone_var_indices):
         self.file_obj = file_obj
         self.clbit_indices = clbit_indices
+        self.standalone_var_indices = standalone_var_indices
 
     def visit_generic(self, node, /):
         raise exceptions.QpyError(f"unhandled Expr object '{node}'")
@@ -107,7 +108,15 @@ def visit_generic(self, node, /):
     def visit_var(self, node, /):
         self.file_obj.write(type_keys.Expression.VAR)
         _write_expr_type(self.file_obj, node.type)
-        if isinstance(node.var, Clbit):
+        if node.standalone:
+            self.file_obj.write(type_keys.ExprVar.UUID)
+            self.file_obj.write(
+                struct.pack(
+                    formats.EXPR_VAR_UUID_PACK,
+                    *formats.EXPR_VAR_UUID(self.standalone_var_indices[node]),
+                )
+            )
+        elif isinstance(node.var, Clbit):
             self.file_obj.write(type_keys.ExprVar.CLBIT)
             self.file_obj.write(
                 struct.pack(
@@ -178,8 +187,13 @@ def visit_binary(self, node, /):
         node.right.accept(self)
 
 
-def _write_expr(file_obj, node: expr.Expr, clbit_indices: collections.abc.Mapping[Clbit, int]):
-    node.accept(_ExprWriter(file_obj, clbit_indices))
+def _write_expr(
+    file_obj,
+    node: expr.Expr,
+    clbit_indices: collections.abc.Mapping[Clbit, int],
+    standalone_var_indices: collections.abc.Mapping[expr.Var, int],
+):
+    node.accept(_ExprWriter(file_obj, clbit_indices, standalone_var_indices))
 
 
 def _write_expr_type(file_obj, type_: types.Type):
@@ -315,12 +329,18 @@ def _read_expr(
     file_obj,
     clbits: collections.abc.Sequence[Clbit],
     cregs: collections.abc.Mapping[str, ClassicalRegister],
+    standalone_vars: collections.abc.Sequence[expr.Var],
 ) -> expr.Expr:
     # pylint: disable=too-many-return-statements
     type_key = file_obj.read(formats.EXPRESSION_DISCRIMINATOR_SIZE)
     type_ = _read_expr_type(file_obj)
     if type_key == type_keys.Expression.VAR:
         var_type_key = file_obj.read(formats.EXPR_VAR_DISCRIMINATOR_SIZE)
+        if var_type_key == type_keys.ExprVar.UUID:
+            payload = formats.EXPR_VAR_UUID._make(
+                struct.unpack(formats.EXPR_VAR_UUID_PACK, file_obj.read(formats.EXPR_VAR_UUID_SIZE))
+            )
+            return standalone_vars[payload.var_index]
         if var_type_key == type_keys.ExprVar.CLBIT:
             payload = formats.EXPR_VAR_CLBIT._make(
                 struct.unpack(
@@ -360,14 +380,20 @@ def _read_expr(
         payload = formats.EXPRESSION_CAST._make(
             struct.unpack(formats.EXPRESSION_CAST_PACK, file_obj.read(formats.EXPRESSION_CAST_SIZE))
         )
-        return expr.Cast(_read_expr(file_obj, clbits, cregs), type_, implicit=payload.implicit)
+        return expr.Cast(
+            _read_expr(file_obj, clbits, cregs, standalone_vars), type_, implicit=payload.implicit
+        )
     if type_key == type_keys.Expression.UNARY:
         payload = formats.EXPRESSION_UNARY._make(
             struct.unpack(
                 formats.EXPRESSION_UNARY_PACK, file_obj.read(formats.EXPRESSION_UNARY_SIZE)
             )
         )
-        return expr.Unary(expr.Unary.Op(payload.opcode), _read_expr(file_obj, clbits, cregs), type_)
+        return expr.Unary(
+            expr.Unary.Op(payload.opcode),
+            _read_expr(file_obj, clbits, cregs, standalone_vars),
+            type_,
+        )
     if type_key == type_keys.Expression.BINARY:
         payload = formats.EXPRESSION_BINARY._make(
             struct.unpack(
@@ -376,8 +402,8 @@ def _read_expr(
         )
         return expr.Binary(
             expr.Binary.Op(payload.opcode),
-            _read_expr(file_obj, clbits, cregs),
-            _read_expr(file_obj, clbits, cregs),
+            _read_expr(file_obj, clbits, cregs, standalone_vars),
+            _read_expr(file_obj, clbits, cregs, standalone_vars),
             type_,
         )
     raise exceptions.QpyError("Invalid classical-expression Expr key '{type_key}'")
@@ -395,7 +421,80 @@ def _read_expr_type(file_obj) -> types.Type:
     raise exceptions.QpyError(f"Invalid classical-expression Type key '{type_key}'")
 
 
-def dumps_value(obj, *, index_map=None, use_symengine=False):
+def read_standalone_vars(file_obj, num_vars):
+    """Read the ``num_vars`` standalone variable declarations from the file.
+
+    Args:
+        file_obj (File): a file-like object to read from.
+        num_vars (int): the number of variables to read.
+
+    Returns:
+        tuple[dict, list]: a mapping of each ``ExprVarDeclaration`` type key to the variables
+        declared with it, and the list of all variables in declaration order.
+
+    Raises:
+        QpyError: if a declaration carries an unrecognised ``ExprVarDeclaration`` usage key.
+    """
+    read_vars = {
+        type_keys.ExprVarDeclaration.INPUT: [],
+        type_keys.ExprVarDeclaration.CAPTURE: [],
+        type_keys.ExprVarDeclaration.LOCAL: [],
+    }
+    var_order = []
+    for _ in range(num_vars):
+        raw = file_obj.read(formats.EXPR_VAR_DECLARATION_SIZE)
+        data = formats.EXPR_VAR_DECLARATION(*struct.unpack(formats.EXPR_VAR_DECLARATION_PACK, raw))
+        if data.usage not in read_vars:
+            raise exceptions.QpyError(f"Invalid standalone-variable usage key '{data.usage}'")
+        type_ = _read_expr_type(file_obj)
+        name = file_obj.read(data.name_size).decode(common.ENCODE)
+        var = expr.Var(uuid.UUID(bytes=data.uuid_bytes), type_, name=name)
+        read_vars[data.usage].append(var)
+        var_order.append(var)
+    return read_vars, var_order
+
+
+def _write_standalone_var(file_obj, var, type_key):
+    """Write one standalone variable declaration for ``var`` with usage ``type_key``.
+
+    The fixed-size declaration struct is written first, then the type, then the encoded name.
+    """
+    encoded_name = var.name.encode(common.ENCODE)
+    declaration = formats.EXPR_VAR_DECLARATION(var.var.bytes, type_key, len(encoded_name))
+    file_obj.write(struct.pack(formats.EXPR_VAR_DECLARATION_PACK, *declaration))
+    _write_expr_type(file_obj, var.type)
+    file_obj.write(encoded_name)
+
+
+def write_standalone_vars(file_obj, circuit):
+    """Write the standalone variables out from a circuit.
+
+    Variables are written as inputs, then captures, then locals, and indexed in that order.
+
+    Args:
+        file_obj (File): the file-like object to write to.
+        circuit (QuantumCircuit): the circuit to take the variables from.
+
+    Returns:
+        dict[expr.Var, int]: a mapping of the variables written to the index that they were written
+        at.
+    """
+    groups = (
+        (type_keys.ExprVarDeclaration.INPUT, circuit.iter_input_vars),
+        (type_keys.ExprVarDeclaration.CAPTURE, circuit.iter_captured_vars),
+        (type_keys.ExprVarDeclaration.LOCAL, circuit.iter_declared_vars),
+    )
+    indices = {}
+    position = 0
+    for usage, iter_vars in groups:
+        for var in iter_vars():
+            _write_standalone_var(file_obj, var, usage)
+            indices[var] = position
+            position += 1
+    return indices
+
+
+def dumps_value(obj, *, index_map=None, use_symengine=False, standalone_var_indices=None):
     """Serialize input value object.
 
     Args:
@@ -407,6 +506,8 @@ def dumps_value(obj, *, index_map=None, use_symengine=False):
             native mechanism. This is a faster serialization alternative, but not supported in all
             platforms. Please check that your target platform is supported by the symengine library
             before setting this option, as it will be required by qpy to deserialize the payload.
+        standalone_var_indices (dict): Dictionary that maps standalone :class:`.expr.Var` entries to
+            the index that should be used to refer to them.
 
     Returns:
         tuple: TypeKey and binary data.
@@ -438,14 +539,20 @@ def dumps_value(obj, *, index_map=None, use_symengine=False):
         )
     elif type_key == type_keys.Value.EXPRESSION:
         clbit_indices = {} if index_map is None else index_map["c"]
-        binary_data = common.data_to_binary(obj, _write_expr, clbit_indices=clbit_indices)
+        standalone_var_indices = {} if standalone_var_indices is None else standalone_var_indices
+        binary_data = common.data_to_binary(
+            obj,
+            _write_expr,
+            clbit_indices=clbit_indices,
+            standalone_var_indices=standalone_var_indices,
+        )
     else:
         raise exceptions.QpyError(f"Serialization for {type_key} is not implemented in value I/O.")
 
     return type_key, binary_data
 
 
-def write_value(file_obj, obj, *, index_map=None, use_symengine=False):
+def write_value(file_obj, obj, *, index_map=None, use_symengine=False, standalone_var_indices=None):
     """Write a value to the file like object.
 
     Args:
@@ -458,13 +565,28 @@ def write_value(file_obj, obj, *, index_map=None, use_symengine=False):
             native mechanism. This is a faster serialization alternative, but not supported in all
             platforms. Please check that your target platform is supported by the symengine library
             before setting this option, as it will be required by qpy to deserialize the payload.
+        standalone_var_indices (dict): Dictionary that maps standalone :class:`.expr.Var` entries to
+            the index that should be used to refer to them.
     """
-    type_key, data = dumps_value(obj, index_map=index_map, use_symengine=use_symengine)
+    type_key, data = dumps_value(
+        obj,
+        index_map=index_map,
+        use_symengine=use_symengine,
+        standalone_var_indices=standalone_var_indices,
+    )
     common.write_generic_typed_data(file_obj, type_key, data)
 
 
 def loads_value(
-    type_key, binary_data, version, vectors, *, clbits=(), cregs=None, use_symengine=False
+    type_key,
+    binary_data,
+    version,
+    vectors,
+    *,
+    clbits=(),
+    cregs=None,
+    use_symengine=False,
+    standalone_vars=(),
 ):
     """Deserialize input binary data to value object.
 
@@ -479,6 +601,8 @@ def loads_value(
             native mechanism. This is a faster serialization alternative, but not supported in all
             platforms. Please check that your target platform is supported by the symengine library
             before setting this option, as it will be required by qpy to deserialize the payload.
+        standalone_vars (Sequence[Var]): standalone :class:`.expr.Var` nodes in the order that they
+            were declared by the circuit header.
 
     Returns:
         any: Deserialized value object.
@@ -520,12 +644,27 @@ def loads_value(
                 use_symengine=use_symengine,
             )
     if type_key == type_keys.Value.EXPRESSION:
-        return common.data_from_binary(binary_data, _read_expr, clbits=clbits, cregs=cregs or {})
+        return common.data_from_binary(
+            binary_data,
+            _read_expr,
+            clbits=clbits,
+            cregs=cregs or {},
+            standalone_vars=standalone_vars,
+        )
 
     raise exceptions.QpyError(f"Serialization for {type_key} is not implemented in value I/O.")
 
 
-def read_value(file_obj, version, vectors, *, clbits=(), cregs=None, use_symengine=False):
+def read_value(
+    file_obj,
+    version,
+    vectors,
+    *,
+    clbits=(),
+    cregs=None,
+    use_symengine=False,
+    standalone_vars=(),
+):
     """Read a value from the file like object.
 
     Args:
@@ -538,6 +677,8 @@ def read_value(file_obj, version, vectors, *, clbits=(), cregs=None, use_symengi
             native mechanism. This is a faster serialization alternative, but not supported in all
             platforms. Please check that your target platform is supported by the symengine library
             before setting this option, as it will be required by qpy to deserialize the payload.
+        standalone_vars (Sequence[expr.Var]): standalone variables in the order they were defined in
+            the QPY payload.
 
     Returns:
         any: Deserialized value object.
@@ -545,5 +686,12 @@ def read_value(file_obj, version, vectors, *, clbits=(), cregs=None, use_symengi
     type_key, data = common.read_generic_typed_data(file_obj)
 
     return loads_value(
-        type_key, data, version, vectors, clbits=clbits, cregs=cregs, use_symengine=use_symengine
+        type_key,
+        data,
+        version,
+        vectors,
+        clbits=clbits,
+        cregs=cregs,
+        use_symengine=use_symengine,
+        standalone_vars=standalone_vars,
     )
diff --git a/qiskit/qpy/common.py b/qiskit/qpy/common.py
index 7cc11fb7ca05..048320d5cad6 100644
--- a/qiskit/qpy/common.py
+++ b/qiskit/qpy/common.py
@@ -20,7 +20,7 @@
 
 from qiskit.qpy import formats
 
-QPY_VERSION = 11
+QPY_VERSION = 12
 QPY_COMPATIBILITY_VERSION = 10
 ENCODE = "utf8"
 
diff --git a/qiskit/qpy/exceptions.py b/qiskit/qpy/exceptions.py
index c6cdb4303a62..5662e6029373 100644
--- a/qiskit/qpy/exceptions.py
+++ b/qiskit/qpy/exceptions.py
@@ -28,6 +28,26 @@ def __str__(self):
         return repr(self.message)
 
 
+class UnsupportedFeatureForVersion(QpyError):
+    """QPY error raised when the target dump version is too low for a feature that is present in the
+    object to be serialized."""
+
+    def __init__(self, feature: str, required: int, target: int):
+        """
+        Args:
+            feature: a description of the problematic feature.
+            required: the minimum version of QPY that would be required to represent this
+                feature.
+            target: the version of QPY that is being used in the serialization.
+        """
+        self.feature = feature
+        self.required = required
+        self.target = target
+        super().__init__(
+            f"Dumping QPY version {target}, but version {required} is required for: {feature}."
+        )
+
+
 class QPYLoadingDeprecatedFeatureWarning(QiskitWarning):
     """Visible deprecation warning for QPY loading functions without
     a stable point in the call stack."""
diff --git a/qiskit/qpy/formats.py b/qiskit/qpy/formats.py
index 958bebd8dad9..a48a9ea777fa 100644
--- a/qiskit/qpy/formats.py
+++ b/qiskit/qpy/formats.py
@@ -42,6 +42,24 @@
 FILE_HEADER_PACK = "!6sBBBBQ"
 FILE_HEADER_SIZE = struct.calcsize(FILE_HEADER_PACK)
 
+
+CIRCUIT_HEADER_V12 = namedtuple(
+    "HEADER",
+    [
+        "name_size",
+        "global_phase_type",
+        "global_phase_size",
+        "num_qubits",
+        "num_clbits",
+        "metadata_size",
+        "num_registers",
+        "num_instructions",
+        "num_vars",
+    ],
+)
+CIRCUIT_HEADER_V12_PACK = "!H1cHIIQIQI"
+CIRCUIT_HEADER_V12_SIZE = struct.calcsize(CIRCUIT_HEADER_V12_PACK)
+
 # CIRCUIT_HEADER_V2
 CIRCUIT_HEADER_V2 = namedtuple(
     "HEADER",
@@ -309,6 +327,13 @@
 INITIAL_LAYOUT_BIT_PACK = "!ii"
 INITIAL_LAYOUT_BIT_SIZE = struct.calcsize(INITIAL_LAYOUT_BIT_PACK)
 
+# EXPR_VAR_DECLARATION
+
+EXPR_VAR_DECLARATION = namedtuple("EXPR_VAR_DECLARATION", ["uuid_bytes", "usage", "name_size"])
+EXPR_VAR_DECLARATION_PACK = "!16scH"
+EXPR_VAR_DECLARATION_SIZE = struct.calcsize(EXPR_VAR_DECLARATION_PACK)
+
+
 # EXPRESSION
 
 EXPRESSION_DISCRIMINATOR_SIZE = 1
@@ -351,6 +376,10 @@
 EXPR_VAR_REGISTER_PACK = "!H"
 EXPR_VAR_REGISTER_SIZE = struct.calcsize(EXPR_VAR_REGISTER_PACK)
 
+EXPR_VAR_UUID = namedtuple("EXPR_VAR_UUID", ["var_index"])
+EXPR_VAR_UUID_PACK = "!H"
+EXPR_VAR_UUID_SIZE = struct.calcsize(EXPR_VAR_UUID_PACK)
+
 
 # EXPR_VALUE
 
diff --git a/qiskit/qpy/type_keys.py b/qiskit/qpy/type_keys.py
index dd0e7fe22693..6ec85115b559 100644
--- a/qiskit/qpy/type_keys.py
+++ b/qiskit/qpy/type_keys.py
@@ -16,6 +16,7 @@
 QPY Type keys for several namespace.
 """
 
+import uuid
 from abc import abstractmethod
 from enum import Enum, IntEnum
 
@@ -471,6 +472,22 @@ def retrieve(cls, type_key):
         raise NotImplementedError
 
 
+class ExprVarDeclaration(TypeKeyBase):
+    """Type keys for the ``EXPR_VAR_DECLARATION`` QPY item."""
+
+    INPUT = b"I"
+    CAPTURE = b"C"
+    LOCAL = b"L"
+
+    @classmethod
+    def assign(cls, obj):
+        raise NotImplementedError
+
+    @classmethod
+    def retrieve(cls, type_key):
+        raise NotImplementedError
+
+
 class ExprType(TypeKeyBase):
     """Type keys for the ``EXPR_TYPE`` QPY item."""
 
@@ -496,9 +513,12 @@ class ExprVar(TypeKeyBase):
 
     CLBIT = b"C"
     REGISTER = b"R"
+    UUID = b"U"
 
     @classmethod
     def assign(cls, obj):
+        if isinstance(obj, uuid.UUID):
+            return cls.UUID
         if isinstance(obj, Clbit):
             return cls.CLBIT
         if isinstance(obj, ClassicalRegister):
diff --git a/qiskit/quantum_info/operators/channel/chi.py b/qiskit/quantum_info/operators/channel/chi.py
index 7cd5fce5258f..ee0ddaa45385 100644
--- a/qiskit/quantum_info/operators/channel/chi.py
+++ b/qiskit/quantum_info/operators/channel/chi.py
@@ -16,10 +16,11 @@
 """
 
 from __future__ import annotations
-import copy
+import copy as _copy
 import math
 import numpy as np
 
+from qiskit import _numpy_compat
 from qiskit.circuit.quantumcircuit import QuantumCircuit
 from qiskit.circuit.instruction import Instruction
 from qiskit.exceptions import QiskitError
@@ -131,10 +132,9 @@ def __init__(
             raise QiskitError("Input is not an n-qubit Chi matrix.")
         super().__init__(chi_mat, num_qubits=num_qubits)
 
-    def __array__(self, dtype=None):
-        if dtype:
-            return np.asarray(self.data, dtype=dtype)
-        return self.data
+    def __array__(self, dtype=None, copy=_numpy_compat.COPY_ONLY_IF_NEEDED):
+        dtype = self.data.dtype if dtype is None else dtype  # respect caller's dtype
+        return np.array(self.data, dtype=dtype, copy=copy)
 
     @property
     def _bipartite_shape(self):
@@ -181,7 +181,7 @@ def expand(self, other: Chi) -> Chi:
 
     @classmethod
     def _tensor(cls, a, b):
-        ret = copy.copy(a)
+        ret = _copy.copy(a)
         ret._op_shape = a._op_shape.tensor(b._op_shape)
         ret._data = np.kron(a._data, b.data)
         return ret
diff --git a/qiskit/quantum_info/operators/channel/choi.py b/qiskit/quantum_info/operators/channel/choi.py
index afd8e4fca6f9..1c9579e45602 100644
--- a/qiskit/quantum_info/operators/channel/choi.py
+++ b/qiskit/quantum_info/operators/channel/choi.py
@@ -16,10 +16,11 @@
 """
 
 from __future__ import annotations
-import copy
+import copy as _copy
 import math
 import numpy as np
 
+from qiskit import _numpy_compat
 from qiskit.circuit.quantumcircuit import QuantumCircuit
 from qiskit.circuit.instruction import Instruction
 from qiskit.exceptions import QiskitError
@@ -134,10 +135,9 @@ def __init__(
             choi_mat = _to_choi(rep, data._data, input_dim, output_dim)
         super().__init__(choi_mat, op_shape=op_shape)
 
-    def __array__(self, dtype=None):
-        if dtype:
-            return np.asarray(self.data, dtype=dtype)
-        return self.data
+    def __array__(self, dtype=None, copy=_numpy_compat.COPY_ONLY_IF_NEEDED):
+        dtype = self.data.dtype if dtype is None else dtype
+        return np.array(self.data, dtype=dtype, copy=copy)
 
     @property
     def _bipartite_shape(self):
@@ -152,12 +152,12 @@ def _evolve(self, state, qargs=None):
     # ---------------------------------------------------------------------
 
     def conjugate(self):
-        ret = copy.copy(self)
+        ret = _copy.copy(self)
         ret._data = np.conj(self._data)
         return ret
 
     def transpose(self):
-        ret = copy.copy(self)
+        ret = _copy.copy(self)
         ret._op_shape = self._op_shape.transpose()
         # Make bipartite matrix
         d_in, d_out = self.dim
@@ -206,7 +206,7 @@ def expand(self, other: Choi) -> Choi:
 
     @classmethod
     def _tensor(cls, a, b):
-        ret = copy.copy(a)
+        ret = _copy.copy(a)
         ret._op_shape = a._op_shape.tensor(b._op_shape)
         ret._data = _bipartite_tensor(
             a._data, b.data, shape1=a._bipartite_shape, shape2=b._bipartite_shape
diff --git a/qiskit/quantum_info/operators/channel/ptm.py b/qiskit/quantum_info/operators/channel/ptm.py
index 1bdf1b5ef235..84db071121f3 100644
--- a/qiskit/quantum_info/operators/channel/ptm.py
+++ b/qiskit/quantum_info/operators/channel/ptm.py
@@ -16,10 +16,11 @@
 """
 
 from __future__ import annotations
-import copy
+import copy as _copy
 import math
 import numpy as np
 
+from qiskit import _numpy_compat
 from qiskit.circuit.quantumcircuit import QuantumCircuit
 from qiskit.circuit.instruction import Instruction
 from qiskit.exceptions import QiskitError
@@ -133,10 +134,9 @@ def __init__(
             raise QiskitError("Input is not an n-qubit Pauli transfer matrix.")
         super().__init__(ptm, num_qubits=num_qubits)
 
-    def __array__(self, dtype=None):
-        if dtype:
-            np.asarray(self.data, dtype=dtype)
-        return self.data
+    def __array__(self, dtype=None, copy=_numpy_compat.COPY_ONLY_IF_NEEDED):
+        dtype = self.data.dtype if dtype is None else dtype
+        return np.array(self.data, dtype=dtype, copy=copy)
 
     @property
     def _bipartite_shape(self):
@@ -194,7 +194,7 @@ def expand(self, other: PTM) -> PTM:
 
     @classmethod
     def _tensor(cls, a, b):
-        ret = copy.copy(a)
+        ret = _copy.copy(a)
         ret._op_shape = a._op_shape.tensor(b._op_shape)
         ret._data = np.kron(a._data, b.data)
         return ret
diff --git a/qiskit/quantum_info/operators/channel/superop.py b/qiskit/quantum_info/operators/channel/superop.py
index 0d7116ef5069..19867696ec6a 100644
--- a/qiskit/quantum_info/operators/channel/superop.py
+++ b/qiskit/quantum_info/operators/channel/superop.py
@@ -15,12 +15,13 @@
 
 from __future__ import annotations
 
-import copy
+import copy as _copy
 import math
 from typing import TYPE_CHECKING
 
 import numpy as np
 
+from qiskit import _numpy_compat
 from qiskit.circuit.instruction import Instruction
 from qiskit.circuit.quantumcircuit import QuantumCircuit
 from qiskit.exceptions import QiskitError
@@ -127,10 +128,9 @@ def __init__(
         # Initialize QuantumChannel
         super().__init__(super_mat, op_shape=op_shape)
 
-    def __array__(self, dtype=None):
-        if dtype:
-            return np.asarray(self.data, dtype=dtype)
-        return self.data
+    def __array__(self, dtype=None, copy=_numpy_compat.COPY_ONLY_IF_NEEDED):
+        dtype = self.data.dtype if dtype is None else dtype
+        return np.array(self.data, dtype=dtype, copy=copy)
 
     @property
     def _tensor_shape(self):
@@ -149,18 +149,18 @@ def _bipartite_shape(self):
     # ---------------------------------------------------------------------
 
     def conjugate(self):
-        ret = copy.copy(self)
+        ret = _copy.copy(self)
         ret._data = np.conj(self._data)
         return ret
 
     def transpose(self):
-        ret = copy.copy(self)
+        ret = _copy.copy(self)
         ret._data = np.transpose(self._data)
         ret._op_shape = self._op_shape.transpose()
         return ret
 
     def adjoint(self):
-        ret = copy.copy(self)
+        ret = _copy.copy(self)
         ret._data = np.conj(np.transpose(self._data))
         ret._op_shape = self._op_shape.transpose()
         return ret
@@ -177,7 +177,7 @@ def expand(self, other: SuperOp) -> SuperOp:
 
     @classmethod
     def _tensor(cls, a, b):
-        ret = copy.copy(a)
+        ret = _copy.copy(a)
         ret._op_shape = a._op_shape.tensor(b._op_shape)
         ret._data = _bipartite_tensor(
             a._data, b.data, shape1=a._bipartite_shape, shape2=b._bipartite_shape
diff --git a/qiskit/quantum_info/operators/dihedral/dihedral.py b/qiskit/quantum_info/operators/dihedral/dihedral.py
index af15e0ed3ae5..4f49879063ec 100644
--- a/qiskit/quantum_info/operators/dihedral/dihedral.py
+++ b/qiskit/quantum_info/operators/dihedral/dihedral.py
@@ -357,10 +357,11 @@ def _from_circuit(self, circuit):
         _append_circuit(elem, circuit)
         return elem
 
-    def __array__(self, dtype=None):
-        if dtype:
-            return np.asarray(self.to_matrix(), dtype=dtype)
-        return self.to_matrix()
+    def __array__(self, dtype=None, copy=None):
+        if copy is False:
+            raise ValueError("unable to avoid copy while creating an array as requested")
+        arr = self.to_matrix()
+        return arr if dtype is None else arr.astype(dtype, copy=False)
 
     def to_matrix(self):
         """Convert operator to Numpy matrix."""
diff --git a/qiskit/quantum_info/operators/operator.py b/qiskit/quantum_info/operators/operator.py
index 90d40b93cc50..41eac3563576 100644
--- a/qiskit/quantum_info/operators/operator.py
+++ b/qiskit/quantum_info/operators/operator.py
@@ -16,13 +16,14 @@
 
 from __future__ import annotations
 
-import copy
+import copy as _copy
 import re
 from numbers import Number
 from typing import TYPE_CHECKING
 
 import numpy as np
 
+from qiskit import _numpy_compat
 from qiskit.circuit.instruction import Instruction
 from qiskit.circuit.library.standard_gates import HGate, IGate, SGate, TGate, XGate, YGate, ZGate
 from qiskit.circuit.operation import Operation
@@ -81,6 +82,9 @@ def __init__(
             a Numpy array of shape (2**N, 2**N) qubit systems will be used. If
             the input operator is not an N-qubit operator, it will assign a
             single subsystem with dimension specified by the shape of the input.
+            Note that two operators initialized via this method are only considered equivalent if they
+            match up to their canonical qubit order (or: permutation). See :meth:`.Operator.from_circuit`
+            to specify a different qubit permutation.
         """
         op_shape = None
         if isinstance(data, (list, np.ndarray)):
@@ -117,10 +121,9 @@ def __init__(
             shape=self._data.shape,
         )
 
-    def __array__(self, dtype=None):
-        if dtype:
-            return np.asarray(self.data, dtype=dtype)
-        return self.data
+    def __array__(self, dtype=None, copy=_numpy_compat.COPY_ONLY_IF_NEEDED):
+        dtype = self.data.dtype if dtype is None else dtype
+        return np.array(self.data, dtype=dtype, copy=copy)
 
     def __repr__(self):
         prefix = "Operator("
@@ -391,8 +394,7 @@ def from_circuit(
         Returns:
             Operator: An operator representing the input circuit
         """
-        dimension = 2**circuit.num_qubits
-        op = cls(np.eye(dimension))
+
         if layout is None:
             if not ignore_set_layout:
                 layout = getattr(circuit, "_layout", None)
@@ -403,27 +405,38 @@ def from_circuit(
                 initial_layout=layout,
                 input_qubit_mapping={qubit: index for index, qubit in enumerate(circuit.qubits)},
             )
+
+        initial_layout = layout.initial_layout if layout is not None else None
+
         if final_layout is None:
             if not ignore_set_layout and layout is not None:
                 final_layout = getattr(layout, "final_layout", None)
 
-        qargs = None
-        # If there was a layout specified (either from the circuit
-        # or via user input) use that to set qargs to permute qubits
-        # based on that layout
-        if layout is not None:
-            physical_to_virtual = layout.initial_layout.get_physical_bits()
-            qargs = [
-                layout.input_qubit_mapping[physical_to_virtual[physical_bit]]
-                for physical_bit in range(len(physical_to_virtual))
-            ]
-        # Convert circuit to an instruction
-        instruction = circuit.to_instruction()
-        op._append_instruction(instruction, qargs=qargs)
-        # If final layout is set permute output indices based on layout
+        from qiskit.synthesis.permutation.permutation_utils import _inverse_pattern
+
+        if initial_layout is not None:
+            input_qubits = [None] * len(layout.input_qubit_mapping)
+            for q, p in layout.input_qubit_mapping.items():
+                input_qubits[p] = q
+
+            initial_permutation = initial_layout.to_permutation(input_qubits)
+            initial_permutation_inverse = _inverse_pattern(initial_permutation)
+
         if final_layout is not None:
-            perm_pattern = [final_layout._v2p[v] for v in circuit.qubits]
-            op = op.apply_permutation(perm_pattern, front=False)
+            final_permutation = final_layout.to_permutation(circuit.qubits)
+            final_permutation_inverse = _inverse_pattern(final_permutation)
+
+        op = Operator(circuit)
+
+        if initial_layout:
+            op = op.apply_permutation(initial_permutation, True)
+
+        if final_layout:
+            op = op.apply_permutation(final_permutation_inverse, False)
+
+        if initial_layout:
+            op = op.apply_permutation(initial_permutation_inverse, False)
+
         return op
 
     def is_unitary(self, atol=None, rtol=None):
@@ -447,13 +460,13 @@ def to_instruction(self):
 
     def conjugate(self):
         # Make a shallow copy and update array
-        ret = copy.copy(self)
+        ret = _copy.copy(self)
         ret._data = np.conj(self._data)
         return ret
 
     def transpose(self):
         # Make a shallow copy and update array
-        ret = copy.copy(self)
+        ret = _copy.copy(self)
         ret._data = np.transpose(self._data)
         ret._op_shape = self._op_shape.transpose()
         return ret
@@ -523,7 +536,7 @@ def power(self, n: float) -> Operator:
         """
         if self.input_dims() != self.output_dims():
             raise QiskitError("Can only power with input_dims = output_dims.")
-        ret = copy.copy(self)
+        ret = _copy.copy(self)
         if isinstance(n, int):
             ret._data = np.linalg.matrix_power(self.data, n)
         else:
@@ -550,7 +563,7 @@ def expand(self, other: Operator) -> Operator:
 
     @classmethod
     def _tensor(cls, a, b):
-        ret = copy.copy(a)
+        ret = _copy.copy(a)
         ret._op_shape = a._op_shape.tensor(b._op_shape)
         ret._data = np.kron(a.data, b.data)
         return ret
@@ -585,7 +598,7 @@ def _add(self, other, qargs=None):
         self._op_shape._validate_add(other._op_shape, qargs)
         other = ScalarOp._pad_with_identity(self, other, qargs)
 
-        ret = copy.copy(self)
+        ret = _copy.copy(self)
         ret._data = self.data + other.data
         return ret
 
@@ -603,7 +616,7 @@ def _multiply(self, other):
         """
         if not isinstance(other, Number):
             raise QiskitError("other is not a number")
-        ret = copy.copy(self)
+        ret = _copy.copy(self)
         ret._data = other * self._data
         return ret
 
@@ -643,7 +656,7 @@ def reverse_qargs(self) -> Operator:
         Returns:
             Operator: the operator with reversed subsystem order.
         """
-        ret = copy.copy(self)
+        ret = _copy.copy(self)
         axes = tuple(range(self._op_shape._num_qargs_l - 1, -1, -1))
         axes = axes + tuple(len(axes) + i for i in axes)
         ret._data = np.reshape(
diff --git a/qiskit/quantum_info/operators/scalar_op.py b/qiskit/quantum_info/operators/scalar_op.py
index d856a39c2a20..38f36193739b 100644
--- a/qiskit/quantum_info/operators/scalar_op.py
+++ b/qiskit/quantum_info/operators/scalar_op.py
@@ -15,7 +15,7 @@
 """
 
 from __future__ import annotations
-import copy
+import copy as _copy
 from numbers import Number
 import numpy as np
 
@@ -52,10 +52,11 @@ def __init__(self, dims: int | tuple | None = None, coeff: Number = 1):
         self._coeff = coeff
         super().__init__(input_dims=dims, output_dims=dims)
 
-    def __array__(self, dtype=None):
-        if dtype:
-            return np.asarray(self.to_matrix(), dtype=dtype)
-        return self.to_matrix()
+    def __array__(self, dtype=None, copy=None):
+        if copy is False:
+            raise ValueError("could not produce matrix without calculation")
+        arr = self.to_matrix()
+        return arr if dtype is None else arr.astype(dtype, copy=False)
 
     def __repr__(self):
         return f"ScalarOp({self.input_dims()}, coeff={self.coeff})"
@@ -104,7 +105,7 @@ def compose(self, other: ScalarOp, qargs: list | None = None, front: bool = Fals
         # If other is also an ScalarOp we only need to
         # update the coefficient and dimensions
         if isinstance(other, ScalarOp):
-            ret = copy.copy(self)
+            ret = _copy.copy(self)
             ret._coeff = self.coeff * other.coeff
             ret._op_shape = new_shape
             return ret
@@ -112,7 +113,7 @@ def compose(self, other: ScalarOp, qargs: list | None = None, front: bool = Fals
         # If we are composing on the full system we return the
         # other operator with reshaped dimensions
         if qargs is None:
-            ret = copy.copy(other)
+            ret = _copy.copy(other)
             ret._op_shape = new_shape
             # Other operator might not support scalar multiplication
             # so we treat the identity as a special case to avoid a
@@ -148,7 +149,7 @@ def tensor(self, other: ScalarOp) -> ScalarOp:
             other = Operator(other)
 
         if isinstance(other, ScalarOp):
-            ret = copy.copy(self)
+            ret = _copy.copy(self)
             ret._coeff = self.coeff * other.coeff
             ret._op_shape = self._op_shape.tensor(other._op_shape)
             return ret
@@ -160,7 +161,7 @@ def expand(self, other: ScalarOp) -> ScalarOp:
             other = Operator(other)
 
         if isinstance(other, ScalarOp):
-            ret = copy.copy(self)
+            ret = _copy.copy(self)
             ret._coeff = self.coeff * other.coeff
             ret._op_shape = self._op_shape.expand(other._op_shape)
             return ret
diff --git a/qiskit/quantum_info/operators/symplectic/clifford.py b/qiskit/quantum_info/operators/symplectic/clifford.py
index f0d30cf59f0c..9a5e8732ae68 100644
--- a/qiskit/quantum_info/operators/symplectic/clifford.py
+++ b/qiskit/quantum_info/operators/symplectic/clifford.py
@@ -37,7 +37,23 @@
 
 
 class Clifford(BaseOperator, AdjointMixin, Operation):
-    """An N-qubit unitary operator from the Clifford group.
+    r"""
+    An N-qubit unitary operator from the Clifford group.
+
+    An N-qubit Clifford operator takes Paulis to Paulis via conjugation
+    (up to a global phase). More precisely, the Clifford group :math:`\mathcal{C}_N`
+    is defined as
+
+     .. math::
+
+        \mathcal{C}_N = \{ U \in U(2^N) | U \mathcal{P}_N U^{\dagger} = \mathcal{P}_N \} / U(1)
+
+     where :math:`\mathcal{P}_N` is the Pauli group on :math:`N` qubits
+     that is generated by single-qubit Pauli operators,
+     and :math:`U` is a unitary operator in the unitary group
+     :math:`U(2^N)` representing operations on :math:`N` qubits.
+     :math:`\mathcal{C}_N` is the quotient group by the subgroup of
+     scalar unitary matrices :math:`U(1)`.
 
     **Representation**
 
@@ -91,7 +107,7 @@ class Clifford(BaseOperator, AdjointMixin, Operation):
     :class:`~qiskit.circuit.library.SGate`, :class:`~qiskit.circuit.library.SdgGate`,
     :class:`~qiskit.circuit.library.SXGate`, :class:`~qiskit.circuit.library.SXdgGate`,
     :class:`~qiskit.circuit.library.CXGate`, :class:`~qiskit.circuit.library.CZGate`,
-    :class:`~qiskit.circuit.library.CYGate`, :class:`~qiskit.circuit.library.DXGate`,
+    :class:`~qiskit.circuit.library.CYGate`, :class:`~qiskit.circuit.library.DCXGate`,
     :class:`~qiskit.circuit.library.SwapGate`, :class:`~qiskit.circuit.library.iSwapGate`,
     :class:`~qiskit.circuit.library.ECRGate`, :class:`~qiskit.circuit.library.LinearFunction`,
     :class:`~qiskit.circuit.library.PermutationGate`.
@@ -122,10 +138,11 @@ class Clifford(BaseOperator, AdjointMixin, Operation):
     _COMPOSE_PHASE_LOOKUP = None
     _COMPOSE_1Q_LOOKUP = None
 
-    def __array__(self, dtype=None):
-        if dtype:
-            return np.asarray(self.to_matrix(), dtype=dtype)
-        return self.to_matrix()
+    def __array__(self, dtype=None, copy=None):
+        if copy is False:
+            raise ValueError("unable to avoid copy while creating an array as requested")
+        arr = self.to_matrix()
+        return arr if dtype is None else arr.astype(dtype, copy=False)
 
     def __init__(self, data, validate=True, copy=True):
         """Initialize an operator object."""
@@ -164,8 +181,17 @@ def __init__(self, data, validate=True, copy=True):
 
         # Initialize StabilizerTable directly from the data
         else:
-            if isinstance(data, (list, np.ndarray)) and np.asarray(data, dtype=bool).ndim == 2:
-                data = np.array(data, dtype=bool, copy=copy)
+            if (
+                isinstance(data, (list, np.ndarray))
+                and (data_asarray := np.asarray(data, dtype=bool)).ndim == 2
+            ):
+                # This little dance is to avoid Numpy 1/2 incompatibilities between the availability
+                # and meaning of the 'copy' argument in 'array' and 'asarray', when the input needs
+                # its dtype converting.  'asarray' prefers to return 'self' if possible in both.
+                if copy and np.may_share_memory(data, data_asarray):
+                    data = data_asarray.copy()
+                else:
+                    data = data_asarray
                 if data.shape[0] == data.shape[1]:
                     self.tableau = self._stack_table_phase(
                         data, np.zeros(data.shape[0], dtype=bool)
diff --git a/qiskit/quantum_info/operators/symplectic/pauli.py b/qiskit/quantum_info/operators/symplectic/pauli.py
index 1bdff0cf8fea..e1bcfa29ebcb 100644
--- a/qiskit/quantum_info/operators/symplectic/pauli.py
+++ b/qiskit/quantum_info/operators/symplectic/pauli.py
@@ -222,10 +222,11 @@ def __str__(self):
             return front + "..."
         return self.to_label()
 
-    def __array__(self, dtype=None):
-        if dtype:
-            return np.asarray(self.to_matrix(), dtype=dtype)
-        return self.to_matrix()
+    def __array__(self, dtype=None, copy=None):
+        if copy is False:
+            raise ValueError("unable to avoid copy while creating an array as requested")
+        arr = self.to_matrix()
+        return arr if dtype is None else arr.astype(dtype, copy=False)
 
     @classmethod
     def set_truncation(cls, val: int):
diff --git a/qiskit/quantum_info/operators/symplectic/pauli_list.py b/qiskit/quantum_info/operators/symplectic/pauli_list.py
index 1bb9ae3d1ada..3d348d236387 100644
--- a/qiskit/quantum_info/operators/symplectic/pauli_list.py
+++ b/qiskit/quantum_info/operators/symplectic/pauli_list.py
@@ -148,14 +148,15 @@ def settings(self):
         """Return settings."""
         return {"data": self.to_labels()}
 
-    def __array__(self, dtype=None):
+    def __array__(self, dtype=None, copy=None):
         """Convert to numpy array"""
-        # pylint: disable=unused-argument
+        if copy is False:
+            raise ValueError("cannot provide a matrix without calculation")
         shape = (len(self),) + 2 * (2**self.num_qubits,)
         ret = np.zeros(shape, dtype=complex)
         for i, mat in enumerate(self.matrix_iter()):
             ret[i] = mat
-        return ret
+        return ret if dtype is None else ret.astype(dtype, copy=False)
 
     @staticmethod
     def _from_paulis(data):
diff --git a/qiskit/quantum_info/operators/symplectic/random.py b/qiskit/quantum_info/operators/symplectic/random.py
index 1a845100b91f..f9bd65ef9187 100644
--- a/qiskit/quantum_info/operators/symplectic/random.py
+++ b/qiskit/quantum_info/operators/symplectic/random.py
@@ -81,7 +81,7 @@ def random_pauli_list(
     z = rng.integers(2, size=(size, num_qubits)).astype(bool)
     x = rng.integers(2, size=(size, num_qubits)).astype(bool)
     if phase:
-        _phase = rng.integers(4, size=(size))
+        _phase = rng.integers(4, size=size)
         return PauliList.from_symplectic(z, x, _phase)
     return PauliList.from_symplectic(z, x)
 
diff --git a/qiskit/quantum_info/operators/symplectic/sparse_pauli_op.py b/qiskit/quantum_info/operators/symplectic/sparse_pauli_op.py
index 0d29cd098dc3..dffe5b2396b2 100644
--- a/qiskit/quantum_info/operators/symplectic/sparse_pauli_op.py
+++ b/qiskit/quantum_info/operators/symplectic/sparse_pauli_op.py
@@ -23,7 +23,13 @@
 import numpy as np
 import rustworkx as rx
 
-from qiskit._accelerate.sparse_pauli_op import unordered_unique, decompose_dense
+from qiskit._accelerate.sparse_pauli_op import (
+    ZXPaulis,
+    decompose_dense,
+    to_matrix_dense,
+    to_matrix_sparse,
+    unordered_unique,
+)
 from qiskit.circuit.parameter import Parameter
 from qiskit.circuit.parameterexpression import ParameterExpression
 from qiskit.circuit.parametertable import ParameterView
@@ -142,7 +148,12 @@ def __init__(
         if coeffs is None:
             coeffs = np.ones(pauli_list.size, dtype=complex)
         else:
-            coeffs = np.array(coeffs, copy=copy, dtype=dtype)
+            coeffs_asarray = np.asarray(coeffs, dtype=dtype)
+            coeffs = (
+                coeffs_asarray.copy()
+                if copy and np.may_share_memory(coeffs, coeffs_asarray)
+                else coeffs_asarray
+            )
 
         if ignore_pauli_phase:
             # Fast path used in copy operations, where the phase of the PauliList is already known
@@ -166,10 +177,11 @@ def __init__(
         # Initialize LinearOp
         super().__init__(num_qubits=self._pauli_list.num_qubits)
 
-    def __array__(self, dtype=None):
-        if dtype:
-            return np.asarray(self.to_matrix(), dtype=dtype)
-        return self.to_matrix()
+    def __array__(self, dtype=None, copy=None):
+        if copy is False:
+            raise ValueError("unable to avoid copy while creating an array as requested")
+        arr = self.to_matrix()
+        return arr if dtype is None else arr.astype(dtype, copy=False)
 
     def __repr__(self):
         prefix = "SparsePauliOp("
@@ -919,24 +931,39 @@ def to_list(self, array: bool = False):
             return labels
         return labels.tolist()
 
-    def to_matrix(self, sparse: bool = False) -> np.ndarray:
+    def to_matrix(self, sparse: bool = False, force_serial: bool = False) -> np.ndarray:
         """Convert to a dense or sparse matrix.
 
         Args:
-            sparse (bool): if True return a sparse CSR matrix, otherwise
-                           return dense Numpy array (Default: False).
+            sparse: if ``True`` return a sparse CSR matrix, otherwise return dense Numpy
+                array (the default).
+            force_serial: if ``True``, use an unthreaded implementation, regardless of the state of
+                the Qiskit threading-control environment variables (see the Qiskit
+                configuration documentation for the variables that are recognised).
+                By default, this will use threaded parallelism over the available CPUs.
 
         Returns:
             array: A dense matrix if `sparse=False`.
             csr_matrix: A sparse matrix in CSR format if `sparse=True`.
         """
-        mat = None
-        for i in self.matrix_iter(sparse=sparse):
-            if mat is None:
-                mat = i
-            else:
-                mat += i
-        return mat
+        if self.coeffs.dtype == object:
+            # Fallback to slow Python-space method.
+            return sum(self.matrix_iter(sparse=sparse))
+
+        pauli_list = self.paulis
+        zx = ZXPaulis(
+            pauli_list.x.astype(np.bool_),
+            pauli_list.z.astype(np.bool_),
+            pauli_list.phase.astype(np.uint8),
+            self.coeffs.astype(np.complex128),
+        )
+        if sparse:
+            from scipy.sparse import csr_matrix
+
+            data, indices, indptr = to_matrix_sparse(zx, force_serial=force_serial)
+            side = 1 << self.num_qubits
+            return csr_matrix((data, indices, indptr), shape=(side, side))
+        return to_matrix_dense(zx, force_serial=force_serial)
 
     def to_operator(self) -> Operator:
         """Convert to a matrix Operator object"""
diff --git a/qiskit/quantum_info/states/densitymatrix.py b/qiskit/quantum_info/states/densitymatrix.py
index 07cc65685745..1c66d8bcf5cf 100644
--- a/qiskit/quantum_info/states/densitymatrix.py
+++ b/qiskit/quantum_info/states/densitymatrix.py
@@ -15,10 +15,11 @@
 """
 
 from __future__ import annotations
-import copy
+import copy as _copy
 from numbers import Number
 import numpy as np
 
+from qiskit import _numpy_compat
 from qiskit.circuit.quantumcircuit import QuantumCircuit
 from qiskit.circuit.instruction import Instruction
 from qiskit.exceptions import QiskitError
@@ -110,10 +111,9 @@ def __init__(
             raise QiskitError("Invalid DensityMatrix input: not a square matrix.")
         super().__init__(op_shape=OpShape.auto(shape=self._data.shape, dims_l=dims, dims_r=dims))
 
-    def __array__(self, dtype=None):
-        if dtype:
-            return np.asarray(self.data, dtype=dtype)
-        return self.data
+    def __array__(self, dtype=None, copy=_numpy_compat.COPY_ONLY_IF_NEEDED):
+        dtype = self.data.dtype if dtype is None else dtype
+        return np.array(self.data, dtype=dtype, copy=copy)
 
     def __eq__(self, other):
         return super().__eq__(other) and np.allclose(
@@ -241,7 +241,7 @@ def tensor(self, other: DensityMatrix) -> DensityMatrix:
         """
         if not isinstance(other, DensityMatrix):
             other = DensityMatrix(other)
-        ret = copy.copy(self)
+        ret = _copy.copy(self)
         ret._data = np.kron(self._data, other._data)
         ret._op_shape = self._op_shape.tensor(other._op_shape)
         return ret
@@ -260,7 +260,7 @@ def expand(self, other: DensityMatrix) -> DensityMatrix:
         """
         if not isinstance(other, DensityMatrix):
             other = DensityMatrix(other)
-        ret = copy.copy(self)
+        ret = _copy.copy(self)
         ret._data = np.kron(other._data, self._data)
         ret._op_shape = self._op_shape.expand(other._op_shape)
         return ret
@@ -281,7 +281,7 @@ def _add(self, other):
         if not isinstance(other, DensityMatrix):
             other = DensityMatrix(other)
         self._op_shape._validate_add(other._op_shape)
-        ret = copy.copy(self)
+        ret = _copy.copy(self)
         ret._data = self.data + other.data
         return ret
 
@@ -299,7 +299,7 @@ def _multiply(self, other):
         """
         if not isinstance(other, Number):
             raise QiskitError("other is not a number")
-        ret = copy.copy(self)
+        ret = _copy.copy(self)
         ret._data = other * self.data
         return ret
 
@@ -356,7 +356,7 @@ def reverse_qargs(self) -> DensityMatrix:
         Returns:
             DensityMatrix: the state with reversed subsystem order.
         """
-        ret = copy.copy(self)
+        ret = _copy.copy(self)
         axes = tuple(range(self._op_shape._num_qargs_l - 1, -1, -1))
         axes = axes + tuple(len(axes) + i for i in axes)
         ret._data = np.reshape(
@@ -523,7 +523,7 @@ def reset(self, qargs: list[int] | None = None) -> DensityMatrix:
         """
         if qargs is None:
             # Resetting all qubits does not require sampling or RNG
-            ret = copy.copy(self)
+            ret = _copy.copy(self)
             state = np.zeros(self._op_shape.shape, dtype=complex)
             state[0, 0] = 1
             ret._data = state
@@ -715,7 +715,7 @@ def _evolve_operator(self, other, qargs=None):
         new_shape._dims_r = new_shape._dims_l
         new_shape._num_qargs_r = new_shape._num_qargs_l
 
-        ret = copy.copy(self)
+        ret = _copy.copy(self)
         if qargs is None:
             # Evolution on full matrix
             op_mat = other.data
@@ -792,7 +792,7 @@ def _evolve_instruction(self, obj, qargs=None):
         """Return a new statevector by applying an instruction."""
         if isinstance(obj, QuantumCircuit):
             obj = obj.to_instruction()
-        vec = copy.copy(self)
+        vec = _copy.copy(self)
         vec._append_instruction(obj, qargs=qargs)
         return vec
 
diff --git a/qiskit/quantum_info/states/statevector.py b/qiskit/quantum_info/states/statevector.py
index b13ccde21743..df39ba42f915 100644
--- a/qiskit/quantum_info/states/statevector.py
+++ b/qiskit/quantum_info/states/statevector.py
@@ -14,13 +14,14 @@
 Statevector quantum state class.
 """
 from __future__ import annotations
-import copy
+import copy as _copy
 import math
 import re
 from numbers import Number
 
 import numpy as np
 
+from qiskit import _numpy_compat
 from qiskit.circuit.quantumcircuit import QuantumCircuit
 from qiskit.circuit.instruction import Instruction
 from qiskit.exceptions import QiskitError
@@ -104,10 +105,9 @@ def __init__(
                 raise QiskitError("Invalid input: not a vector or column-vector.")
         super().__init__(op_shape=OpShape.auto(shape=shape, dims_l=dims, num_qubits_r=0))
 
-    def __array__(self, dtype=None):
-        if dtype:
-            return np.asarray(self.data, dtype=dtype)
-        return self.data
+    def __array__(self, dtype=None, copy=_numpy_compat.COPY_ONLY_IF_NEEDED):
+        dtype = self.data.dtype if dtype is None else dtype
+        return np.array(self.data, dtype=dtype, copy=copy)
 
     def __eq__(self, other):
         return super().__eq__(other) and np.allclose(
@@ -277,7 +277,7 @@ def tensor(self, other: Statevector) -> Statevector:
         """
         if not isinstance(other, Statevector):
             other = Statevector(other)
-        ret = copy.copy(self)
+        ret = _copy.copy(self)
         ret._op_shape = self._op_shape.tensor(other._op_shape)
         ret._data = np.kron(self._data, other._data)
         return ret
@@ -318,7 +318,7 @@ def expand(self, other: Statevector) -> Statevector:
         """
         if not isinstance(other, Statevector):
             other = Statevector(other)
-        ret = copy.copy(self)
+        ret = _copy.copy(self)
         ret._op_shape = self._op_shape.expand(other._op_shape)
         ret._data = np.kron(other._data, self._data)
         return ret
@@ -339,7 +339,7 @@ def _add(self, other):
         if not isinstance(other, Statevector):
             other = Statevector(other)
         self._op_shape._validate_add(other._op_shape)
-        ret = copy.copy(self)
+        ret = _copy.copy(self)
         ret._data = self.data + other.data
         return ret
 
@@ -357,7 +357,7 @@ def _multiply(self, other):
         """
         if not isinstance(other, Number):
             raise QiskitError("other is not a number")
-        ret = copy.copy(self)
+        ret = _copy.copy(self)
         ret._data = other * self.data
         return ret
 
@@ -382,7 +382,7 @@ def evolve(
             qargs = getattr(other, "qargs", None)
 
         # Get return vector
-        ret = copy.copy(self)
+        ret = _copy.copy(self)
 
         # Evolution by a circuit or instruction
         if isinstance(other, QuantumCircuit):
@@ -448,7 +448,7 @@ def reverse_qargs(self) -> Statevector:
         Returns:
             Statevector: the Statevector with reversed subsystem order.
         """
-        ret = copy.copy(self)
+        ret = _copy.copy(self)
         axes = tuple(range(self._op_shape._num_qargs_l - 1, -1, -1))
         ret._data = np.reshape(
             np.transpose(np.reshape(self.data, self._op_shape.tensor_shape), axes),
@@ -619,7 +619,7 @@ def reset(self, qargs: list[int] | None = None) -> Statevector:
         """
         if qargs is None:
             # Resetting all qubits does not require sampling or RNG
-            ret = copy.copy(self)
+            ret = _copy.copy(self)
             state = np.zeros(self._op_shape.shape, dtype=complex)
             state[0] = 1
             ret._data = state
diff --git a/qiskit/synthesis/__init__.py b/qiskit/synthesis/__init__.py
index c191eac97472..49e7885d5096 100644
--- a/qiskit/synthesis/__init__.py
+++ b/qiskit/synthesis/__init__.py
@@ -51,6 +51,7 @@
 .. autofunction:: synth_permutation_depth_lnn_kms
 .. autofunction:: synth_permutation_basic
 .. autofunction:: synth_permutation_acg
+.. autofunction:: synth_permutation_reverse_lnn_kms
 
 Clifford Synthesis
 ==================
@@ -140,6 +141,7 @@
     synth_permutation_depth_lnn_kms,
     synth_permutation_basic,
     synth_permutation_acg,
+    synth_permutation_reverse_lnn_kms,
 )
 from .linear import (
     synth_cnot_count_full_pmh,
diff --git a/qiskit/synthesis/linear_phase/cz_depth_lnn.py b/qiskit/synthesis/linear_phase/cz_depth_lnn.py
index b3931d078179..6dc7db5d619b 100644
--- a/qiskit/synthesis/linear_phase/cz_depth_lnn.py
+++ b/qiskit/synthesis/linear_phase/cz_depth_lnn.py
@@ -24,24 +24,10 @@
 
 import numpy as np
 from qiskit.circuit import QuantumCircuit
-
-
-def _append_cx_stage1(qc, n):
-    """A single layer of CX gates."""
-    for i in range(n // 2):
-        qc.cx(2 * i, 2 * i + 1)
-    for i in range((n + 1) // 2 - 1):
-        qc.cx(2 * i + 2, 2 * i + 1)
-    return qc
-
-
-def _append_cx_stage2(qc, n):
-    """A single layer of CX gates."""
-    for i in range(n // 2):
-        qc.cx(2 * i + 1, 2 * i)
-    for i in range((n + 1) // 2 - 1):
-        qc.cx(2 * i + 1, 2 * i + 2)
-    return qc
+from qiskit.synthesis.permutation.permutation_reverse_lnn import (
+    _append_cx_stage1,
+    _append_cx_stage2,
+)
 
 
 def _odd_pattern1(n):
diff --git a/qiskit/synthesis/permutation/__init__.py b/qiskit/synthesis/permutation/__init__.py
index 7cc8d0174d71..5a8b9a7a13f8 100644
--- a/qiskit/synthesis/permutation/__init__.py
+++ b/qiskit/synthesis/permutation/__init__.py
@@ -15,3 +15,4 @@
 
 from .permutation_lnn import synth_permutation_depth_lnn_kms
 from .permutation_full import synth_permutation_basic, synth_permutation_acg
+from .permutation_reverse_lnn import synth_permutation_reverse_lnn_kms
diff --git a/qiskit/synthesis/permutation/permutation_reverse_lnn.py b/qiskit/synthesis/permutation/permutation_reverse_lnn.py
new file mode 100644
index 000000000000..26287a06177e
--- /dev/null
+++ b/qiskit/synthesis/permutation/permutation_reverse_lnn.py
@@ -0,0 +1,90 @@
+# This code is part of Qiskit.
+#
+# (C) Copyright IBM 2024
+#
+# This code is licensed under the Apache License, Version 2.0. You may
+# obtain a copy of this license in the LICENSE.txt file in the root directory
+# of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+#
+# Any modifications or derivative works of this code must retain this
+# copyright notice, and modified files need to carry a notice indicating
+# that they have been altered from the originals.
+"""
+Synthesis of a reverse permutation for LNN connectivity.
+"""
+
+from qiskit.circuit import QuantumCircuit
+
+
+def _append_cx_stage1(qc, n):
+    """A single layer of CX gates."""
+    for i in range(n // 2):
+        qc.cx(2 * i, 2 * i + 1)
+    for i in range((n + 1) // 2 - 1):
+        qc.cx(2 * i + 2, 2 * i + 1)
+    return qc
+
+
+def _append_cx_stage2(qc, n):
+    """A single layer of CX gates."""
+    for i in range(n // 2):
+        qc.cx(2 * i + 1, 2 * i)
+    for i in range((n + 1) // 2 - 1):
+        qc.cx(2 * i + 1, 2 * i + 2)
+    return qc
+
+
+def _append_reverse_permutation_lnn_kms(qc: QuantumCircuit, num_qubits: int) -> None:
+    """
+    Append the reverse permutation to a QuantumCircuit for linear nearest-neighbor architectures
+    using the Kutin, Moulton, Smithline method.
+
+    Synthesis algorithm for reverse permutation from [1], section 5.
+    This algorithm synthesizes the reverse permutation on :math:`n` qubits over
+    a linear nearest-neighbor architecture using CX gates with depth :math:`2 * n + 2`.
+
+    Args:
+        qc: The original quantum circuit.
+        num_qubits: The number of qubits.
+
+    Returns:
+        ``None``. The reverse permutation is appended to ``qc`` in place.
+
+    References:
+        1. Kutin, S., Moulton, D. P., Smithline, L.,
+           *Computation at a distance*, Chicago J. Theor. Comput. Sci., vol. 2007, (2007),
+           `arXiv:quant-ph/0701194 <https://arxiv.org/abs/quant-ph/0701194>`_
+    """
+
+    for _ in range((num_qubits + 1) // 2):
+        _append_cx_stage1(qc, num_qubits)
+        _append_cx_stage2(qc, num_qubits)
+    if (num_qubits % 2) == 0:
+        _append_cx_stage1(qc, num_qubits)
+
+
+def synth_permutation_reverse_lnn_kms(num_qubits: int) -> QuantumCircuit:
+    """
+    Synthesize the reverse permutation for linear nearest-neighbor architectures using
+    the Kutin, Moulton, Smithline method.
+
+    Synthesis algorithm for reverse permutation from [1], section 5.
+    This algorithm synthesizes the reverse permutation on :math:`n` qubits over
+    a linear nearest-neighbor architecture using CX gates with depth :math:`2 * n + 2`.
+
+    Args:
+        num_qubits: The number of qubits.
+
+    Returns:
+        The synthesized quantum circuit.
+
+    References:
+        1. Kutin, S., Moulton, D. P., Smithline, L.,
+           *Computation at a distance*, Chicago J. Theor. Comput. Sci., vol. 2007, (2007),
+           `arXiv:quant-ph/0701194 <https://arxiv.org/abs/quant-ph/0701194>`_
+    """
+
+    qc = QuantumCircuit(num_qubits)
+    _append_reverse_permutation_lnn_kms(qc, num_qubits)
+
+    return qc
diff --git a/qiskit/synthesis/qft/qft_decompose_lnn.py b/qiskit/synthesis/qft/qft_decompose_lnn.py
index 4dd8d9d56d13..a54be481f51b 100644
--- a/qiskit/synthesis/qft/qft_decompose_lnn.py
+++ b/qiskit/synthesis/qft/qft_decompose_lnn.py
@@ -15,7 +15,7 @@
 
 import numpy as np
 from qiskit.circuit import QuantumCircuit
-from qiskit.synthesis.linear_phase.cz_depth_lnn import _append_cx_stage1, _append_cx_stage2
+from qiskit.synthesis.permutation.permutation_reverse_lnn import _append_reverse_permutation_lnn_kms
 
 
 def synth_qft_line(
@@ -65,10 +65,6 @@ def synth_qft_line(
     if not do_swaps:
         # Add a reversal network for LNN connectivity in depth 2*n+2,
         # based on Kutin at al., https://arxiv.org/abs/quant-ph/0701194, Section 5.
-        for _ in range((num_qubits + 1) // 2):
-            qc = _append_cx_stage1(qc, num_qubits)
-            qc = _append_cx_stage2(qc, num_qubits)
-        if (num_qubits % 2) == 0:
-            qc = _append_cx_stage1(qc, num_qubits)
+        _append_reverse_permutation_lnn_kms(qc, num_qubits)
 
     return qc
diff --git a/qiskit/synthesis/two_qubit/two_qubit_decompose.py b/qiskit/synthesis/two_qubit/two_qubit_decompose.py
index ee6f40b58c8c..41ba75c6b237 100644
--- a/qiskit/synthesis/two_qubit/two_qubit_decompose.py
+++ b/qiskit/synthesis/two_qubit/two_qubit_decompose.py
@@ -29,14 +29,27 @@
 import io
 import base64
 import warnings
-from typing import Optional, Type
+from typing import Optional, Type, TYPE_CHECKING
 
 import logging
 
 import numpy as np
 
 from qiskit.circuit import QuantumRegister, QuantumCircuit, Gate
-from qiskit.circuit.library.standard_gates import CXGate, U3Gate, U2Gate, U1Gate
+from qiskit.circuit.library.standard_gates import (
+    CXGate,
+    U3Gate,
+    U2Gate,
+    U1Gate,
+    UGate,
+    PhaseGate,
+    RXGate,
+    RYGate,
+    RZGate,
+    SXGate,
+    XGate,
+    RGate,
+)
 from qiskit.exceptions import QiskitError
 from qiskit.quantum_info.operators import Operator
 from qiskit.synthesis.one_qubit.one_qubit_decompose import (
@@ -46,9 +59,28 @@
 from qiskit.utils.deprecation import deprecate_func
 from qiskit._accelerate import two_qubit_decompose
 
+if TYPE_CHECKING:
+    from qiskit.dagcircuit.dagcircuit import DAGCircuit
+
 logger = logging.getLogger(__name__)
 
 
+GATE_NAME_MAP = {
+    "cx": CXGate,
+    "rx": RXGate,
+    "sx": SXGate,
+    "x": XGate,
+    "rz": RZGate,
+    "u": UGate,
+    "p": PhaseGate,
+    "u1": U1Gate,
+    "u2": U2Gate,
+    "u3": U3Gate,
+    "ry": RYGate,
+    "r": RGate,
+}
+
+
 def decompose_two_qubit_product_gate(special_unitary_matrix: np.ndarray):
     r"""Decompose :math:`U = U_l \otimes U_r` where :math:`U \in SU(4)`,
     and :math:`U_l,~U_r \in SU(2)`.
@@ -481,6 +513,7 @@ class TwoQubitBasisDecomposer:
             If ``False``, don't attempt optimization. If ``None``, attempt optimization but don't raise
             if unknown.
 
+
     .. automethod:: __call__
     """
 
@@ -585,9 +618,10 @@ def __call__(
         unitary: Operator | np.ndarray,
         basis_fidelity: float | None = None,
         approximate: bool = True,
+        use_dag: bool = False,
         *,
         _num_basis_uses: int | None = None,
-    ) -> QuantumCircuit:
+    ) -> QuantumCircuit | DAGCircuit:
         r"""Decompose a two-qubit ``unitary`` over fixed basis and :math:`SU(2)` using the best
         approximation given that each basis application has a finite ``basis_fidelity``.
 
@@ -596,6 +630,8 @@ def __call__(
             basis_fidelity (float or None): Fidelity to be assumed for applications of KAK Gate.
                 If given, overrides ``basis_fidelity`` given at init.
             approximate (bool): Approximates if basis fidelities are less than 1.0.
+            use_dag (bool): If ``True``, a :class:`.DAGCircuit` is returned instead of a
+                :class:`.QuantumCircuit` when this class is called.
             _num_basis_uses (int): force a particular approximation by passing a number in [0, 3].
 
         Returns:
@@ -612,26 +648,40 @@ def __call__(
             _num_basis_uses=_num_basis_uses,
         )
         q = QuantumRegister(2)
-        circ = QuantumCircuit(q, global_phase=sequence.global_phase)
-        for name, params, qubits in sequence:
-            try:
-                getattr(circ, name)(*params, *qubits)
-            except AttributeError as exc:
+        if use_dag:
+            from qiskit.dagcircuit.dagcircuit import DAGCircuit
+
+            dag = DAGCircuit()
+            dag.global_phase = sequence.global_phase
+            dag.add_qreg(q)
+            for name, params, qubits in sequence:
                 if name == "USER_GATE":
-                    circ.append(self.gate, qubits)
-                elif name == "u3":
-                    gate = U3Gate(*params)
-                    circ.append(gate, qubits)
-                elif name == "u2":
-                    gate = U2Gate(*params)
-                    circ.append(gate, qubits)
-                elif name == "u1":
-                    gate = U1Gate(*params)
-                    circ.append(gate, qubits)
+                    dag.apply_operation_back(self.gate, tuple(q[x] for x in qubits), check=False)
                 else:
-                    raise QiskitError(f"Unknown gate {name}") from exc
-
-        return circ
+                    gate = GATE_NAME_MAP[name](*params)
+                    dag.apply_operation_back(gate, tuple(q[x] for x in qubits), check=False)
+            return dag
+        else:
+            circ = QuantumCircuit(q, global_phase=sequence.global_phase)
+            for name, params, qubits in sequence:
+                try:
+                    getattr(circ, name)(*params, *qubits)
+                except AttributeError as exc:
+                    if name == "USER_GATE":
+                        circ.append(self.gate, qubits)
+                    elif name == "u3":
+                        gate = U3Gate(*params)
+                        circ.append(gate, qubits)
+                    elif name == "u2":
+                        gate = U2Gate(*params)
+                        circ.append(gate, qubits)
+                    elif name == "u1":
+                        gate = U1Gate(*params)
+                        circ.append(gate, qubits)
+                    else:
+                        raise QiskitError(f"Unknown gate {name}") from exc
+
+            return circ
 
     def traces(self, target):
         r"""
diff --git a/qiskit/synthesis/two_qubit/xx_decompose/decomposer.py b/qiskit/synthesis/two_qubit/xx_decompose/decomposer.py
index 54a7b3b8da4f..e9394d3919f1 100644
--- a/qiskit/synthesis/two_qubit/xx_decompose/decomposer.py
+++ b/qiskit/synthesis/two_qubit/xx_decompose/decomposer.py
@@ -230,6 +230,7 @@ def __call__(
         unitary: Operator | np.ndarray,
         basis_fidelity: dict | float | None = None,
         approximate: bool = True,
+        use_dag: bool = False,
     ) -> QuantumCircuit:
         r"""
         Fashions a circuit which (perhaps approximately) models the special unitary operation
@@ -246,6 +247,8 @@ def __call__(
                 interpreted as ``{pi: f, pi/2: f/2, pi/3: f/3}``.
                 If given, overrides the basis_fidelity given at init.
             approximate (bool): Approximates if basis fidelities are less than 1.0 .
+            use_dag (bool): If ``True``, a :class:`.DAGCircuit` is returned instead of a
+                :class:`.QuantumCircuit` when this class is called.
 
         Returns:
             QuantumCircuit: Synthesized circuit.
@@ -279,7 +282,7 @@ def __call__(
             and self.backup_optimizer is not None
         ):
             pi2_fidelity = 1 - strength_to_infidelity[np.pi / 2]
-            return self.backup_optimizer(unitary, basis_fidelity=pi2_fidelity)
+            return self.backup_optimizer(unitary, basis_fidelity=pi2_fidelity, use_dag=use_dag)
 
         # change to positive canonical coordinates
         if weyl_decomposition.c >= -EPSILON:
@@ -314,5 +317,8 @@ def __call__(
         circ.append(UnitaryGate(weyl_decomposition.K1l), [1])
 
         circ = self._decomposer1q(circ)
+        if use_dag:
+            from qiskit.converters import circuit_to_dag
 
+            return circuit_to_dag(circ, copy_operations=False)
         return circ
diff --git a/qiskit/transpiler/passes/optimization/commutation_analysis.py b/qiskit/transpiler/passes/optimization/commutation_analysis.py
index 751e3d8d4f5f..eddb659f0a25 100644
--- a/qiskit/transpiler/passes/optimization/commutation_analysis.py
+++ b/qiskit/transpiler/passes/optimization/commutation_analysis.py
@@ -47,7 +47,7 @@ def run(self, dag):
         # self.property_set['commutation_set'][wire][(node, wire)] will give the
         # commutation set that contains node.
 
-        for wire in dag.wires:
+        for wire in dag.qubits:
             self.property_set["commutation_set"][wire] = []
 
         # Add edges to the dictionary for each qubit
@@ -56,7 +56,7 @@ def run(self, dag):
                 self.property_set["commutation_set"][(node, edge_wire)] = -1
 
         # Construct the commutation set
-        for wire in dag.wires:
+        for wire in dag.qubits:
 
             for current_gate in dag.nodes_on_wire(wire):
 
diff --git a/qiskit/transpiler/passes/optimization/commutative_cancellation.py b/qiskit/transpiler/passes/optimization/commutative_cancellation.py
index b0eb6bd24137..396186fa95cc 100644
--- a/qiskit/transpiler/passes/optimization/commutative_cancellation.py
+++ b/qiskit/transpiler/passes/optimization/commutative_cancellation.py
@@ -99,7 +99,7 @@ def run(self, dag):
         #  - For 2qbit gates the key: (gate_type, first_qbit, sec_qbit, first commutation_set_id,
         #    sec_commutation_set_id), the value is the list gates that share the same gate type,
         #    qubits and commutation sets.
-        for wire in dag.wires:
+        for wire in dag.qubits:
             wire_commutation_set = self.property_set["commutation_set"][wire]
 
             for com_set_idx, com_set in enumerate(wire_commutation_set):
diff --git a/qiskit/transpiler/passes/optimization/template_matching/forward_match.py b/qiskit/transpiler/passes/optimization/template_matching/forward_match.py
index decc24453b06..627db502d33e 100644
--- a/qiskit/transpiler/passes/optimization/template_matching/forward_match.py
+++ b/qiskit/transpiler/passes/optimization/template_matching/forward_match.py
@@ -148,9 +148,7 @@ def _find_forward_candidates(self, node_id_t):
 
         if self.template_dag_dep.direct_successors(node_id_t):
             maximal_index = self.template_dag_dep.direct_successors(node_id_t)[-1]
-            for elem in pred:
-                if elem > maximal_index:
-                    pred.remove(elem)
+            pred = [elem for elem in pred if elem <= maximal_index]
 
         block = []
         for node_id in pred:
diff --git a/qiskit/transpiler/passes/scheduling/alignments/reschedule.py b/qiskit/transpiler/passes/scheduling/alignments/reschedule.py
index 618186a34f9c..5cab7028745b 100644
--- a/qiskit/transpiler/passes/scheduling/alignments/reschedule.py
+++ b/qiskit/transpiler/passes/scheduling/alignments/reschedule.py
@@ -17,6 +17,7 @@
 from qiskit.circuit.gate import Gate
 from qiskit.circuit.delay import Delay
 from qiskit.circuit.measure import Measure
+from qiskit.circuit.reset import Reset
 from qiskit.dagcircuit import DAGCircuit, DAGOpNode, DAGOutNode
 from qiskit.transpiler.basepasses import AnalysisPass
 from qiskit.transpiler.exceptions import TranspilerError
@@ -121,7 +122,7 @@ def _push_node_back(self, dag: DAGCircuit, node: DAGOpNode):
 
         if isinstance(node.op, Gate):
             alignment = self.pulse_align
-        elif isinstance(node.op, Measure):
+        elif isinstance(node.op, (Measure, Reset)):
             alignment = self.acquire_align
         elif isinstance(node.op, Delay) or getattr(node.op, "_directive", False):
             # Directive or delay. These can start at arbitrary time.
@@ -143,7 +144,7 @@ def _push_node_back(self, dag: DAGCircuit, node: DAGOpNode):
         # Compute shifted t1 of this node separately for qreg and creg
         new_t1q = this_t0 + node.op.duration
         this_qubits = set(node.qargs)
-        if isinstance(node.op, Measure):
+        if isinstance(node.op, (Measure, Reset)):
             # creg access ends at the end of instruction
             new_t1c = new_t1q
             this_clbits = set(node.cargs)
@@ -161,7 +162,7 @@ def _push_node_back(self, dag: DAGCircuit, node: DAGOpNode):
             # Compute next node start time separately for qreg and creg
             next_t0q = node_start_time[next_node]
             next_qubits = set(next_node.qargs)
-            if isinstance(next_node.op, Measure):
+            if isinstance(next_node.op, (Measure, Reset)):
                 # creg access starts after write latency
                 next_t0c = next_t0q + clbit_write_latency
                 next_clbits = set(next_node.cargs)
diff --git a/qiskit/transpiler/passes/scheduling/base_scheduler.py b/qiskit/transpiler/passes/scheduling/base_scheduler.py
index 78e2660e505d..4085844a4709 100644
--- a/qiskit/transpiler/passes/scheduling/base_scheduler.py
+++ b/qiskit/transpiler/passes/scheduling/base_scheduler.py
@@ -11,11 +11,13 @@
 # that they have been altered from the originals.
 
 """Base circuit scheduling pass."""
+import warnings
+
 from qiskit.transpiler import InstructionDurations
 from qiskit.transpiler.basepasses import TransformationPass
 from qiskit.transpiler.passes.scheduling.time_unit_conversion import TimeUnitConversion
-from qiskit.dagcircuit import DAGOpNode, DAGCircuit
-from qiskit.circuit import Delay, Gate
+from qiskit.dagcircuit import DAGOpNode, DAGCircuit, DAGOutNode
+from qiskit.circuit import Delay, Gate, Measure, Reset
 from qiskit.circuit.parameterexpression import ParameterExpression
 from qiskit.transpiler.exceptions import TranspilerError
 from qiskit.transpiler.target import Target
@@ -269,6 +271,23 @@ def _get_node_duration(
         else:
             duration = node.op.duration
 
+        if isinstance(node.op, Reset):
+            warnings.warn(
+                "Qiskit scheduler assumes Reset works similarly to Measure instruction. "
+                "Actual behavior depends on the control system of your quantum backend. "
+                "Your backend may provide a plugin scheduler pass."
+            )
+        elif isinstance(node.op, Measure):
+            is_mid_circuit = not any(
+                isinstance(x, DAGOutNode) for x in dag.quantum_successors(node)
+            )
+            if is_mid_circuit:
+                warnings.warn(
+                    "Qiskit scheduler assumes mid-circuit measurement works as a standard instruction. "
+                    "Actual backend may apply custom scheduling. "
+                    "Your backend may provide a plugin scheduler pass."
+                )
+
         if isinstance(duration, ParameterExpression):
             raise TranspilerError(
                 f"Parameterized duration ({duration}) "
diff --git a/qiskit/transpiler/passes/scheduling/dynamical_decoupling.py b/qiskit/transpiler/passes/scheduling/dynamical_decoupling.py
index 5b84b529e453..12f4bc515b29 100644
--- a/qiskit/transpiler/passes/scheduling/dynamical_decoupling.py
+++ b/qiskit/transpiler/passes/scheduling/dynamical_decoupling.py
@@ -1,6 +1,6 @@
 # This code is part of Qiskit.
 #
-# (C) Copyright IBM 2021.
+# (C) Copyright IBM 2021, 2024.
 #
 # This code is licensed under the Apache License, Version 2.0. You may
 # obtain a copy of this license in the LICENSE.txt file in the root directory
@@ -20,6 +20,7 @@
 from qiskit.dagcircuit import DAGOpNode, DAGInNode
 from qiskit.quantum_info.operators.predicates import matrix_equal
 from qiskit.synthesis.one_qubit import OneQubitEulerDecomposer
+from qiskit.transpiler import InstructionDurations
 from qiskit.transpiler.passes.optimization import Optimize1qGates
 from qiskit.transpiler.basepasses import TransformationPass
 from qiskit.transpiler.exceptions import TranspilerError
@@ -168,6 +169,8 @@ def run(self, dag):
         if dag.duration is None:
             raise TranspilerError("DD runs after circuit is scheduled.")
 
+        durations = self._update_inst_durations(dag)
+
         num_pulses = len(self._dd_sequence)
         sequence_gphase = 0
         if num_pulses != 1:
@@ -208,7 +211,7 @@ def run(self, dag):
             for index, gate in enumerate(self._dd_sequence):
                 gate = gate.to_mutable()
                 self._dd_sequence[index] = gate
-                gate.duration = self._durations.get(gate, physical_qubit)
+                gate.duration = durations.get(gate, physical_qubit)
 
                 dd_sequence_duration += gate.duration
             index_sequence_duration_map[physical_qubit] = dd_sequence_duration
@@ -277,6 +280,26 @@ def run(self, dag):
 
         return new_dag
 
+    def _update_inst_durations(self, dag):
+        """Update instruction durations with circuit information. If the dag contains gate
+        calibrations and no instruction durations were provided through the target or as a
+        standalone input, the circuit calibration durations will be used.
+        The priority order for instruction durations is: target > standalone > circuit.
+        """
+        circ_durations = InstructionDurations()
+
+        if dag.calibrations:
+            cal_durations = []
+            for gate, gate_cals in dag.calibrations.items():
+                for (qubits, parameters), schedule in gate_cals.items():
+                    cal_durations.append((gate, qubits, parameters, schedule.duration))
+            circ_durations.update(cal_durations, circ_durations.dt)
+
+        if self._durations is not None:
+            circ_durations.update(self._durations, getattr(self._durations, "dt", None))
+
+        return circ_durations
+
     def __gate_supported(self, gate: Gate, qarg: int) -> bool:
         """A gate is supported on the qubit (qarg) or not."""
         if self._target is None or self._target.instruction_supported(gate.name, qargs=(qarg,)):
diff --git a/qiskit/transpiler/passes/scheduling/padding/dynamical_decoupling.py b/qiskit/transpiler/passes/scheduling/padding/dynamical_decoupling.py
index 42a1bdc80f14..7cb309dd9aa1 100644
--- a/qiskit/transpiler/passes/scheduling/padding/dynamical_decoupling.py
+++ b/qiskit/transpiler/passes/scheduling/padding/dynamical_decoupling.py
@@ -1,6 +1,6 @@
 # This code is part of Qiskit.
 #
-# (C) Copyright IBM 2021.
+# (C) Copyright IBM 2021, 2024.
 #
 # This code is licensed under the Apache License, Version 2.0. You may
 # obtain a copy of this license in the LICENSE.txt file in the root directory
@@ -179,9 +179,31 @@ def __init__(
                         f"{gate.name} in dd_sequence is not supported in the target"
                     )
 
+    def _update_inst_durations(self, dag):
+        """Update instruction durations with circuit information. If the dag contains gate
+        calibrations and no instruction durations were provided through the target or as a
+        standalone input, the circuit calibration durations will be used.
+        The priority order for instruction durations is: target > standalone > circuit.
+        """
+        circ_durations = InstructionDurations()
+
+        if dag.calibrations:
+            cal_durations = []
+            for gate, gate_cals in dag.calibrations.items():
+                for (qubits, parameters), schedule in gate_cals.items():
+                    cal_durations.append((gate, qubits, parameters, schedule.duration))
+            circ_durations.update(cal_durations, circ_durations.dt)
+
+        if self._durations is not None:
+            circ_durations.update(self._durations, getattr(self._durations, "dt", None))
+
+        return circ_durations
+
     def _pre_runhook(self, dag: DAGCircuit):
         super()._pre_runhook(dag)
 
+        durations = self._update_inst_durations(dag)
+
         num_pulses = len(self._dd_sequence)
 
         # Check if physical circuit is given
@@ -245,7 +267,7 @@ def _pre_runhook(self, dag: DAGCircuit):
                             f"is not acceptable in {self.__class__.__name__} pass."
                         )
                 except KeyError:
-                    gate_length = self._durations.get(gate, physical_index)
+                    gate_length = durations.get(gate, physical_index)
                 sequence_lengths.append(gate_length)
                 # Update gate duration. This is necessary for current timeline drawer, i.e. scheduled.
                 gate = gate.to_mutable()
diff --git a/qiskit/transpiler/passes/scheduling/time_unit_conversion.py b/qiskit/transpiler/passes/scheduling/time_unit_conversion.py
index d53c3fc4ef6a..25672c137f34 100644
--- a/qiskit/transpiler/passes/scheduling/time_unit_conversion.py
+++ b/qiskit/transpiler/passes/scheduling/time_unit_conversion.py
@@ -1,6 +1,6 @@
 # This code is part of Qiskit.
 #
-# (C) Copyright IBM 2021.
+# (C) Copyright IBM 2021, 2024.
 #
 # This code is licensed under the Apache License, Version 2.0. You may
 # obtain a copy of this license in the LICENSE.txt file in the root directory
@@ -51,6 +51,7 @@ def __init__(self, inst_durations: InstructionDurations = None, target: Target =
         self.inst_durations = inst_durations or InstructionDurations()
         if target is not None:
             self.inst_durations = target.durations()
+        self._durations_provided = inst_durations is not None or target is not None
 
     def run(self, dag: DAGCircuit):
         """Run the TimeUnitAnalysis pass on `dag`.
@@ -64,8 +65,11 @@ def run(self, dag: DAGCircuit):
         Raises:
             TranspilerError: if the units are not unifiable
         """
+
+        inst_durations = self._update_inst_durations(dag)
+
         # Choose unit
-        if self.inst_durations.dt is not None:
+        if inst_durations.dt is not None:
             time_unit = "dt"
         else:
             # Check what units are used in delays and other instructions: dt or SI or mixed
@@ -75,7 +79,7 @@ def run(self, dag: DAGCircuit):
                     "Fail to unify time units in delays. SI units "
                     "and dt unit must not be mixed when dt is not supplied."
                 )
-            units_other = self.inst_durations.units_used()
+            units_other = inst_durations.units_used()
             if self._unified(units_other) == "mixed":
                 raise TranspilerError(
                     "Fail to unify time units in instruction_durations. SI units "
@@ -96,7 +100,7 @@ def run(self, dag: DAGCircuit):
         # Make units consistent
         for node in dag.op_nodes():
             try:
-                duration = self.inst_durations.get(
+                duration = inst_durations.get(
                     node.op, [dag.find_bit(qarg).index for qarg in node.qargs], unit=time_unit
                 )
             except TranspilerError:
@@ -108,6 +112,26 @@ def run(self, dag: DAGCircuit):
         self.property_set["time_unit"] = time_unit
         return dag
 
+    def _update_inst_durations(self, dag):
+        """Update instruction durations with circuit information. If the dag contains gate
+        calibrations and no instruction durations were provided through the target or as a
+        standalone input, the circuit calibration durations will be used.
+        The priority order for instruction durations is: target > standalone > circuit.
+        """
+        circ_durations = InstructionDurations()
+
+        if dag.calibrations:
+            cal_durations = []
+            for gate, gate_cals in dag.calibrations.items():
+                for (qubits, parameters), schedule in gate_cals.items():
+                    cal_durations.append((gate, qubits, parameters, schedule.duration))
+            circ_durations.update(cal_durations, circ_durations.dt)
+
+        if self._durations_provided:
+            circ_durations.update(self.inst_durations, getattr(self.inst_durations, "dt", None))
+
+        return circ_durations
+
     @staticmethod
     def _units_used_in_delays(dag: DAGCircuit) -> Set[str]:
         units_used = set()
diff --git a/qiskit/transpiler/passes/synthesis/high_level_synthesis.py b/qiskit/transpiler/passes/synthesis/high_level_synthesis.py
index 3d3e2a6851af..fd21ae6a75fc 100644
--- a/qiskit/transpiler/passes/synthesis/high_level_synthesis.py
+++ b/qiskit/transpiler/passes/synthesis/high_level_synthesis.py
@@ -133,7 +133,7 @@
    TokenSwapperSynthesisPermutation
 """
 
-from typing import Optional, Union, List, Tuple
+from typing import Optional, Union, List, Tuple, Callable
 
 import numpy as np
 import rustworkx as rx
@@ -227,16 +227,34 @@ class HLSConfig:
     :ref:`using-high-level-synthesis-plugins`.
     """
 
-    def __init__(self, use_default_on_unspecified=True, **kwargs):
+    def __init__(
+        self,
+        use_default_on_unspecified: bool = True,
+        plugin_selection: str = "sequential",
+        plugin_evaluation_fn: Optional[Callable[[QuantumCircuit], int]] = None,
+        **kwargs,
+    ):
         """Creates a high-level-synthesis config.
 
         Args:
-            use_default_on_unspecified (bool): if True, every higher-level-object without an
+            use_default_on_unspecified: if True, every higher-level-object without an
                 explicitly specified list of methods will be synthesized using the "default"
                 algorithm if it exists.
+            plugin_selection: if set to ``"sequential"`` (default), for every higher-level-object
+                the synthesis pass will consider the specified methods sequentially, stopping
+                at the first method that is able to synthesize the object. If set to ``"all"``,
+                all the specified methods will be considered, and the best synthesized circuit,
+                according to ``plugin_evaluation_fn`` will be chosen.
+            plugin_evaluation_fn: a callable that evaluates the quality of the synthesized
+                quantum circuit; a smaller value means a better circuit. If ``None``, the
+                quality of the circuit is its size (i.e. the number of gates that it contains).
             kwargs: a dictionary mapping higher-level-objects to lists of synthesis methods.
         """
         self.use_default_on_unspecified = use_default_on_unspecified
+        self.plugin_selection = plugin_selection
+        self.plugin_evaluation_fn = (
+            plugin_evaluation_fn if plugin_evaluation_fn is not None else lambda qc: qc.size()
+        )
         self.methods = {}
 
         for key, value in kwargs.items():
@@ -248,9 +266,6 @@ def set_methods(self, hls_name, hls_methods):
         self.methods[hls_name] = hls_methods
 
 
-# ToDo: Do we have a way to specify optimization criteria (e.g., 2q gate count vs. depth)?
-
-
 class HighLevelSynthesis(TransformationPass):
     """Synthesize higher-level objects and unroll custom definitions.
 
@@ -500,6 +515,9 @@ def _synthesize_op_using_plugins(
         else:
             methods = []
 
+        best_decomposition = None
+        best_score = np.inf
+
         for method in methods:
             # There are two ways to specify a synthesis method. The more explicit
             # way is to specify it as a tuple consisting of a synthesis algorithm and a
@@ -538,11 +556,22 @@ def _synthesize_op_using_plugins(
             )
 
             # The synthesis methods that are not suited for the given higher-level-object
-            # will return None, in which case the next method in the list will be used.
+            # will return None.
             if decomposition is not None:
-                return decomposition
+                if self.hls_config.plugin_selection == "sequential":
+                    # In the "sequential" mode the first successful decomposition is
+                    # returned.
+                    best_decomposition = decomposition
+                    break
 
-        return None
+                # In the "all" mode every method is tried, and we keep the decomposition
+                # with the lowest score discovered so far.
+                current_score = self.hls_config.plugin_evaluation_fn(decomposition)
+                if current_score < best_score:
+                    best_decomposition = decomposition
+                    best_score = current_score
+
+        return best_decomposition
 
     def _synthesize_annotated_op(self, op: Operation) -> Union[Operation, None]:
         """
@@ -732,9 +761,9 @@ class KMSSynthesisLinearFunction(HighLevelSynthesisPlugin):
 
     * use_inverted: Indicates whether to run the algorithm on the inverse matrix
         and to invert the synthesized circuit.
-        In certain cases this provides a better decomposition then the direct approach.
+        In certain cases this provides a better decomposition than the direct approach.
     * use_transposed: Indicates whether to run the algorithm on the transposed matrix
-        and to invert the order oF CX gates in the synthesized circuit.
+        and to invert the order of CX gates in the synthesized circuit.
         In certain cases this provides a better decomposition than the direct approach.
 
     """
@@ -778,9 +807,9 @@ class PMHSynthesisLinearFunction(HighLevelSynthesisPlugin):
     * section size: The size of each section used in the Patel–Markov–Hayes algorithm [1].
     * use_inverted: Indicates whether to run the algorithm on the inverse matrix
         and to invert the synthesized circuit.
-        In certain cases this provides a better decomposition then the direct approach.
+        In certain cases this provides a better decomposition than the direct approach.
     * use_transposed: Indicates whether to run the algorithm on the transposed matrix
-        and to invert the order oF CX gates in the synthesized circuit.
+        and to invert the order of CX gates in the synthesized circuit.
         In certain cases this provides a better decomposition than the direct approach.
 
     References:
diff --git a/qiskit/transpiler/passes/synthesis/unitary_synthesis.py b/qiskit/transpiler/passes/synthesis/unitary_synthesis.py
index 606df60869c9..a30411d16a93 100644
--- a/qiskit/transpiler/passes/synthesis/unitary_synthesis.py
+++ b/qiskit/transpiler/passes/synthesis/unitary_synthesis.py
@@ -24,7 +24,6 @@
 from __future__ import annotations
 from math import pi, inf, isclose
 from typing import Any
-from copy import deepcopy
 from itertools import product
 from functools import partial
 import numpy as np
@@ -40,6 +39,7 @@
 from qiskit.synthesis.two_qubit.two_qubit_decompose import (
     TwoQubitBasisDecomposer,
     TwoQubitWeylDecomposition,
+    GATE_NAME_MAP,
 )
 from qiskit.quantum_info import Operator
 from qiskit.circuit import ControlFlowOp, Gate, Parameter
@@ -147,20 +147,26 @@ def _error(circuit, target=None, qubits=None):
     of circuit as a weak proxy for error.
     """
     if target is None:
-        return len(circuit)
+        if isinstance(circuit, DAGCircuit):
+            return len(circuit.op_nodes())
+        else:
+            return len(circuit)
     gate_fidelities = []
     gate_durations = []
-    for inst in circuit:
-        inst_qubits = tuple(qubits[circuit.find_bit(q).index] for q in inst.qubits)
+
+    def score_instruction(inst, inst_qubits):
         try:
             keys = target.operation_names_for_qargs(inst_qubits)
             for key in keys:
                 target_op = target.operation_from_name(key)
-                if isinstance(target_op, inst.operation.base_class) and (
+                if isinstance(circuit, DAGCircuit):
+                    op = inst.op
+                else:
+                    op = inst.operation
+                if isinstance(target_op, op.base_class) and (
                     target_op.is_parameterized()
                     or all(
-                        isclose(float(p1), float(p2))
-                        for p1, p2 in zip(target_op.params, inst.operation.params)
+                        isclose(float(p1), float(p2)) for p1, p2 in zip(target_op.params, op.params)
                     )
                 ):
                     inst_props = target[key].get(inst_qubits, None)
@@ -177,10 +183,22 @@ def _error(circuit, target=None, qubits=None):
             else:
                 raise KeyError
         except KeyError as error:
+            if isinstance(circuit, DAGCircuit):
+                op = inst.op
+            else:
+                op = inst.operation
             raise TranspilerError(
-                f"Encountered a bad synthesis. "
-                f"Target has no {inst.operation} on qubits {qubits}."
+                f"Encountered a bad synthesis. " f"Target has no {op} on qubits {qubits}."
             ) from error
+
+    if isinstance(circuit, DAGCircuit):
+        for inst in circuit.topological_op_nodes():
+            inst_qubits = tuple(qubits[circuit.find_bit(q).index] for q in inst.qargs)
+            score_instruction(inst, inst_qubits)
+    else:
+        for inst in circuit:
+            inst_qubits = tuple(qubits[circuit.find_bit(q).index] for q in inst.qubits)
+            score_instruction(inst, inst_qubits)
     # TODO:return np.sum(gate_durations)
     return 1 - np.prod(gate_fidelities)
 
@@ -276,7 +294,7 @@ def __init__(
         natural_direction: bool | None = None,
         synth_gates: list[str] | None = None,
         method: str = "default",
-        min_qubits: int = None,
+        min_qubits: int = 0,
         plugin_config: dict = None,
         target: Target = None,
     ):
@@ -482,27 +500,55 @@ def _run_main_loop(
                 ]
             )
 
-        for node in dag.named_nodes(*self._synth_gates):
-            if self._min_qubits is not None and len(node.qargs) < self._min_qubits:
-                continue
-            synth_dag = None
-            unitary = node.op.to_matrix()
-            n_qubits = len(node.qargs)
-            if (plugin_method.max_qubits is not None and n_qubits > plugin_method.max_qubits) or (
-                plugin_method.min_qubits is not None and n_qubits < plugin_method.min_qubits
-            ):
-                method, kwargs = default_method, default_kwargs
+        out_dag = dag.copy_empty_like()
+        for node in dag.topological_op_nodes():
+            if node.op.name == "unitary" and len(node.qargs) >= self._min_qubits:
+                synth_dag = None
+                unitary = node.op.to_matrix()
+                n_qubits = len(node.qargs)
+                if (
+                    plugin_method.max_qubits is not None and n_qubits > plugin_method.max_qubits
+                ) or (plugin_method.min_qubits is not None and n_qubits < plugin_method.min_qubits):
+                    method, kwargs = default_method, default_kwargs
+                else:
+                    method, kwargs = plugin_method, plugin_kwargs
+                if method.supports_coupling_map:
+                    kwargs["coupling_map"] = (
+                        self._coupling_map,
+                        [qubit_indices[x] for x in node.qargs],
+                    )
+                synth_dag = method.run(unitary, **kwargs)
+                if synth_dag is None:
+                    out_dag.apply_operation_back(node.op, node.qargs, node.cargs, check=False)
+                    continue
+                if isinstance(synth_dag, DAGCircuit):
+                    qubit_map = dict(zip(synth_dag.qubits, node.qargs))
+                    for node in synth_dag.topological_op_nodes():
+                        out_dag.apply_operation_back(
+                            node.op, (qubit_map[x] for x in node.qargs), check=False
+                        )
+                    out_dag.global_phase += synth_dag.global_phase
+                else:
+                    node_list, global_phase, gate = synth_dag
+                    qubits = node.qargs
+                    for (
+                        op_name,
+                        params,
+                        qargs,
+                    ) in node_list:
+                        if op_name == "USER_GATE":
+                            op = gate
+                        else:
+                            op = GATE_NAME_MAP[op_name](*params)
+                        out_dag.apply_operation_back(
+                            op,
+                            (qubits[x] for x in qargs),
+                            check=False,
+                        )
+                    out_dag.global_phase += global_phase
             else:
-                method, kwargs = plugin_method, plugin_kwargs
-            if method.supports_coupling_map:
-                kwargs["coupling_map"] = (
-                    self._coupling_map,
-                    [qubit_indices[x] for x in node.qargs],
-                )
-            synth_dag = method.run(unitary, **kwargs)
-            if synth_dag is not None:
-                dag.substitute_node_with_dag(node, synth_dag)
-        return dag
+                out_dag.apply_operation_back(node.op, node.qargs, node.cargs, check=False)
+        return out_dag
 
 
 def _build_gate_lengths(props=None, target=None):
@@ -876,6 +922,20 @@ def run(self, unitary, **options):
                 decomposers2q = [decomposer2q] if decomposer2q is not None else []
             # choose the cheapest output among synthesized circuits
             synth_circuits = []
+            # With a single TwoQubitBasisDecomposer, skip dag creation: there are no candidate
+            # circuits to store and compare, so the gates can be built directly in the output dag.
+            if len(decomposers2q) == 1 and isinstance(decomposers2q[0], TwoQubitBasisDecomposer):
+                preferred_direction = _preferred_direction(
+                    decomposers2q[0],
+                    qubits,
+                    natural_direction,
+                    coupling_map,
+                    gate_lengths,
+                    gate_errors,
+                )
+                return self._synth_su4_no_dag(
+                    unitary, decomposers2q[0], preferred_direction, approximation_degree
+                )
             for decomposer2q in decomposers2q:
                 preferred_direction = _preferred_direction(
                     decomposer2q, qubits, natural_direction, coupling_map, gate_lengths, gate_errors
@@ -896,24 +956,57 @@ def run(self, unitary, **options):
 
             # only decompose if needed. TODO: handle basis better
             synth_circuit = qs_decomposition(unitary) if (basis_gates or target) else None
+        if synth_circuit is None:
+            return None
+        if isinstance(synth_circuit, DAGCircuit):
+            return synth_circuit
+        return circuit_to_dag(synth_circuit)
 
-        synth_dag = circuit_to_dag(synth_circuit) if synth_circuit is not None else None
-        return synth_dag
-
-    def _synth_su4(self, su4_mat, decomposer2q, preferred_direction, approximation_degree):
+    def _synth_su4_no_dag(self, unitary, decomposer2q, preferred_direction, approximation_degree):
         approximate = not approximation_degree == 1.0
-        synth_circ = decomposer2q(su4_mat, approximate=approximate)
+        synth_circ = decomposer2q._inner_decomposer(unitary, approximate=approximate)
+        if not preferred_direction:
+            return (synth_circ, synth_circ.global_phase, decomposer2q.gate)
 
+        synth_direction = None
         # if the gates in synthesis are in the opposite direction of the preferred direction
         # resynthesize a new operator which is the original conjugated by swaps.
         # this new operator is doubly mirrored from the original and is locally equivalent.
+        for op_name, _params, qubits in synth_circ:
+            if op_name in {"USER_GATE", "cx"}:
+                synth_direction = qubits
+        if synth_direction is not None and synth_direction != preferred_direction:
+            # TODO: Avoid using a dag to correct the synthesis direction
+            return self._reversed_synth_su4(unitary, decomposer2q, approximation_degree)
+        return (synth_circ, synth_circ.global_phase, decomposer2q.gate)
+
+    def _synth_su4(self, su4_mat, decomposer2q, preferred_direction, approximation_degree):
+        approximate = not approximation_degree == 1.0
+        synth_circ = decomposer2q(su4_mat, approximate=approximate, use_dag=True)
+        if not preferred_direction:
+            return synth_circ
         synth_direction = None
-        for inst in synth_circ:
-            if inst.operation.num_qubits == 2:
-                synth_direction = [synth_circ.find_bit(q).index for q in inst.qubits]
-        if preferred_direction and synth_direction != preferred_direction:
-            su4_mat_mm = deepcopy(su4_mat)
-            su4_mat_mm[[1, 2]] = su4_mat_mm[[2, 1]]
-            su4_mat_mm[:, [1, 2]] = su4_mat_mm[:, [2, 1]]
-            synth_circ = decomposer2q(su4_mat_mm, approximate=approximate).reverse_bits()
+        # if the gates in synthesis are in the opposite direction of the preferred direction
+        # resynthesize a new operator which is the original conjugated by swaps.
+        # this new operator is doubly mirrored from the original and is locally equivalent.
+        for inst in synth_circ.topological_op_nodes():
+            if inst.op.num_qubits == 2:
+                synth_direction = [synth_circ.find_bit(q).index for q in inst.qargs]
+        if synth_direction is not None and synth_direction != preferred_direction:
+            return self._reversed_synth_su4(su4_mat, decomposer2q, approximation_degree)
         return synth_circ
+
+    def _reversed_synth_su4(self, su4_mat, decomposer2q, approximation_degree):
+        approximate = not approximation_degree == 1.0
+        su4_mat_mm = su4_mat.copy()
+        su4_mat_mm[[1, 2]] = su4_mat_mm[[2, 1]]
+        su4_mat_mm[:, [1, 2]] = su4_mat_mm[:, [2, 1]]
+        synth_circ = decomposer2q(su4_mat_mm, approximate=approximate, use_dag=True)
+        out_dag = DAGCircuit()
+        out_dag.global_phase = synth_circ.global_phase
+        out_dag.add_qubits(list(reversed(synth_circ.qubits)))
+        flip_bits = out_dag.qubits[::-1]
+        for node in synth_circ.topological_op_nodes():
+            qubits = tuple(flip_bits[synth_circ.find_bit(x).index] for x in node.qargs)
+            out_dag.apply_operation_back(node.op, qubits, check=False)
+        return out_dag
diff --git a/qiskit/visualization/array.py b/qiskit/visualization/array.py
index b076e38b174d..3a8ef2917156 100644
--- a/qiskit/visualization/array.py
+++ b/qiskit/visualization/array.py
@@ -33,7 +33,7 @@ def _num_to_latex(raw_value, decimals=15, first_term=True, coefficient=False):
     """
     import sympy  # runtime import
 
-    raw_value = np.around(raw_value, decimals=decimals)
+    raw_value = np.around(raw_value, decimals=decimals).item()
     value = sympy.nsimplify(raw_value, rational=False)
 
     if isinstance(value, sympy.core.numbers.Rational) and value.denominator > 50:
diff --git a/qiskit/visualization/circuit/matplotlib.py b/qiskit/visualization/circuit/matplotlib.py
index 8d83fecb8969..c547846acc5b 100644
--- a/qiskit/visualization/circuit/matplotlib.py
+++ b/qiskit/visualization/circuit/matplotlib.py
@@ -33,6 +33,7 @@
     IfElseOp,
     ForLoopOp,
     SwitchCaseOp,
+    CircuitError,
 )
 from qiskit.circuit.controlflow import condition_resources
 from qiskit.circuit.classical import expr
@@ -46,7 +47,8 @@
     XGate,
     ZGate,
 )
-from qiskit.qasm3.exporter import QASM3Builder
+from qiskit.qasm3 import ast
+from qiskit.qasm3.exporter import _ExprBuilder
 from qiskit.qasm3.printer import BasicPrinter
 
 from qiskit.circuit.tools.pi_check import pi_check
@@ -393,7 +395,7 @@ def draw(self, filename=None, verbose=False):
             matplotlib_close_if_inline(mpl_figure)
             return mpl_figure
 
-    def _get_layer_widths(self, node_data, wire_map, outer_circuit, glob_data, builder=None):
+    def _get_layer_widths(self, node_data, wire_map, outer_circuit, glob_data):
         """Compute the layer_widths for the layers"""
 
         layer_widths = {}
@@ -482,18 +484,41 @@ def _get_layer_widths(self, node_data, wire_map, outer_circuit, glob_data, build
                     if (isinstance(op, SwitchCaseOp) and isinstance(op.target, expr.Expr)) or (
                         getattr(op, "condition", None) and isinstance(op.condition, expr.Expr)
                     ):
-                        condition = op.target if isinstance(op, SwitchCaseOp) else op.condition
-                        if builder is None:
-                            builder = QASM3Builder(
-                                outer_circuit,
-                                includeslist=("stdgates.inc",),
-                                basis_gates=("U",),
-                                disable_constants=False,
-                                allow_aliasing=False,
+
+                        def lookup_var(var):
+                            """Look up a classical-expression variable or register/bit in our
+                            internal symbol table, and return an OQ3-like identifier."""
+                            # We don't attempt to disambiguate anything like register/var naming
+                            # collisions; we already don't really show classical variables.
+                            if isinstance(var, expr.Var):
+                                return ast.Identifier(var.name)
+                            if isinstance(var, ClassicalRegister):
+                                return ast.Identifier(var.name)
+                            # Single clbit.  This is not actually the correct way to lookup a bit on
+                            # the circuit (it doesn't handle bit bindings fully), but the mpl
+                            # drawer doesn't completely track inner-outer _bit_ bindings, only
+                            # inner-indices, so we can't fully recover the information losslessly.
+                            # Since most control-flow uses the control-flow builders, we should
+                            # decay to something usable most of the time.
+                            try:
+                                register, bit_index, reg_index = get_bit_reg_index(
+                                    outer_circuit, var
+                                )
+                            except CircuitError:
+                                # We failed to find the bit due to binding problems - fall back to
+                                # something that's probably wrong, but at least disambiguating.
+                                return ast.Identifier(f"bit{wire_map[var]}")
+                            if register is None:
+                                return ast.Identifier(f"bit{bit_index}")
+                            return ast.SubscriptedIdentifier(
+                                register.name, ast.IntegerLiteral(reg_index)
                             )
-                            builder.build_classical_declarations()
+
+                        condition = op.target if isinstance(op, SwitchCaseOp) else op.condition
                         stream = StringIO()
-                        BasicPrinter(stream, indent="  ").visit(builder.build_expression(condition))
+                        BasicPrinter(stream, indent="  ").visit(
+                            condition.accept(_ExprBuilder(lookup_var))
+                        )
                         expr_text = stream.getvalue()
                         # Truncate expr_text so that first gate is no more than about 3 x_index's over
                         if len(expr_text) > self._expr_len:
@@ -570,7 +595,7 @@ def _get_layer_widths(self, node_data, wire_map, outer_circuit, glob_data, build
 
                         # Recursively call _get_layer_widths for the circuit inside the ControlFlowOp
                         flow_widths = flow_drawer._get_layer_widths(
-                            node_data, flow_wire_map, outer_circuit, glob_data, builder
+                            node_data, flow_wire_map, outer_circuit, glob_data
                         )
                         layer_widths.update(flow_widths)
 
@@ -1243,6 +1268,11 @@ def _condition(self, node, node_data, wire_map, outer_circuit, cond_xy, glob_dat
             self._ax.add_patch(box)
             xy_plot.append(xy)
 
+        if not xy_plot:
+            # Expression that's only on new-style `expr.Var` nodes, and doesn't need any vertical
+            # line drawing.
+            return
+
         qubit_b = min(node_data[node].q_xy, key=lambda xy: xy[1])
         clbit_b = min(xy_plot, key=lambda xy: xy[1])
 
diff --git a/qiskit/visualization/circuit/text.py b/qiskit/visualization/circuit/text.py
index 1e6137275a9f..abefe5511775 100644
--- a/qiskit/visualization/circuit/text.py
+++ b/qiskit/visualization/circuit/text.py
@@ -20,7 +20,7 @@
 import collections
 import sys
 
-from qiskit.circuit import Qubit, Clbit, ClassicalRegister
+from qiskit.circuit import Qubit, Clbit, ClassicalRegister, CircuitError
 from qiskit.circuit import ControlledGate, Reset, Measure
 from qiskit.circuit import ControlFlowOp, WhileLoopOp, IfElseOp, ForLoopOp, SwitchCaseOp
 from qiskit.circuit.classical import expr
@@ -28,8 +28,9 @@
 from qiskit.circuit.library.standard_gates import IGate, RZZGate, SwapGate, SXGate, SXdgGate
 from qiskit.circuit.annotated_operation import _canonicalize_modifiers, ControlModifier
 from qiskit.circuit.tools.pi_check import pi_check
-from qiskit.qasm3.exporter import QASM3Builder
+from qiskit.qasm3 import ast
 from qiskit.qasm3.printer import BasicPrinter
+from qiskit.qasm3.exporter import _ExprBuilder
 
 from ._utils import (
     get_gate_ctrl_text,
@@ -748,7 +749,6 @@ def __init__(
 
         self._nest_depth = 0  # nesting depth for control flow ops
         self._expr_text = ""  # expression text to display
-        self._builder = None  # QASM3Builder class instance for expressions
 
         # Because jupyter calls both __repr__ and __repr_html__ for some backends,
         # the entire drawer can be run twice which can result in different output
@@ -1306,25 +1306,44 @@ def add_control_flow(self, node, layers, wire_map):
         if (isinstance(node.op, SwitchCaseOp) and isinstance(node.op.target, expr.Expr)) or (
             getattr(node.op, "condition", None) and isinstance(node.op.condition, expr.Expr)
         ):
+
+            def lookup_var(var):
+                """Look up a classical-expression variable or register/bit in our internal symbol
+                table, and return an OQ3-like identifier."""
+                # We don't attempt to disambiguate anything like register/var naming collisions; we
+                # already don't really show classical variables.
+                if isinstance(var, expr.Var):
+                    return ast.Identifier(var.name)
+                if isinstance(var, ClassicalRegister):
+                    return ast.Identifier(var.name)
+                # Single clbit.  This is not actually the correct way to lookup a bit on the
+                # circuit (it doesn't handle bit bindings fully), but the text drawer doesn't
+                # completely track inner-outer _bit_ bindings, only inner-indices, so we can't fully
+                # recover the information losslessly.  Since most control-flow uses the control-flow
+                # builders, we should decay to something usable most of the time.
+                try:
+                    register, bit_index, reg_index = get_bit_reg_index(self._circuit, var)
+                except CircuitError:
+                    # We failed to find the bit due to binding problems - fall back to something
+                    # that's probably wrong, but at least disambiguating.
+                    return ast.Identifier(f"_bit{wire_map[var]}")
+                if register is None:
+                    return ast.Identifier(f"_bit{bit_index}")
+                return ast.SubscriptedIdentifier(register.name, ast.IntegerLiteral(reg_index))
+
             condition = node.op.target if isinstance(node.op, SwitchCaseOp) else node.op.condition
-            if self._builder is None:
-                self._builder = QASM3Builder(
-                    self._circuit,
-                    includeslist=("stdgates.inc",),
-                    basis_gates=("U",),
-                    disable_constants=False,
-                    allow_aliasing=False,
-                )
-                self._builder.build_classical_declarations()
+            draw_conditional = bool(node_resources(condition).clbits)
             stream = StringIO()
-            BasicPrinter(stream, indent="  ").visit(self._builder.build_expression(condition))
+            BasicPrinter(stream, indent="  ").visit(condition.accept(_ExprBuilder(lookup_var)))
             self._expr_text = stream.getvalue()
             # Truncate expr_text at 30 chars or user-set expr_len
             if len(self._expr_text) > self.expr_len:
                 self._expr_text = self._expr_text[: self.expr_len] + "..."
+        else:
+            draw_conditional = not isinstance(node.op, ForLoopOp)
 
         # # Draw a left box such as If, While, For, and Switch
-        flow_layer = self.draw_flow_box(node, wire_map, CF_LEFT)
+        flow_layer = self.draw_flow_box(node, wire_map, CF_LEFT, conditional=draw_conditional)
         layers.append(flow_layer.full_layer)
 
         # Get the list of circuits in the ControlFlowOp from the node blocks
@@ -1351,7 +1370,9 @@ def add_control_flow(self, node, layers, wire_map):
 
             if circ_num > 0:
                 # Draw a middle box such as Else and Case
-                flow_layer = self.draw_flow_box(node, flow_wire_map, CF_MID, circ_num - 1)
+                flow_layer = self.draw_flow_box(
+                    node, flow_wire_map, CF_MID, circ_num - 1, conditional=False
+                )
                 layers.append(flow_layer.full_layer)
 
             _, _, nodes = _get_layered_instructions(circuit, wire_map=flow_wire_map)
@@ -1380,14 +1401,13 @@ def add_control_flow(self, node, layers, wire_map):
                 layers.append(flow_layer2.full_layer)
 
         # Draw the right box for End
-        flow_layer = self.draw_flow_box(node, flow_wire_map, CF_RIGHT)
+        flow_layer = self.draw_flow_box(node, flow_wire_map, CF_RIGHT, conditional=False)
         layers.append(flow_layer.full_layer)
 
-    def draw_flow_box(self, node, flow_wire_map, section, circ_num=0):
+    def draw_flow_box(self, node, flow_wire_map, section, circ_num=0, conditional=False):
         """Draw the left, middle, or right of a control flow box"""
 
         op = node.op
-        conditional = section == CF_LEFT and not isinstance(op, ForLoopOp)
         depth = str(self._nest_depth)
         if section == CF_LEFT:
             etext = ""
diff --git a/qiskit/visualization/pulse_v2/interface.py b/qiskit/visualization/pulse_v2/interface.py
index 90e4fdfa60e6..75370905d8d9 100644
--- a/qiskit/visualization/pulse_v2/interface.py
+++ b/qiskit/visualization/pulse_v2/interface.py
@@ -27,8 +27,10 @@
 from qiskit.visualization.exceptions import VisualizationError
 from qiskit.visualization.pulse_v2 import core, device_info, stylesheet, types
 from qiskit.exceptions import MissingOptionalLibraryError
+from qiskit.utils import deprecate_arg
 
 
+@deprecate_arg("show_barriers", new_alias="plot_barriers", since="1.1.0", pending=True)
 def draw(
     program: Union[Waveform, SymbolicPulse, Schedule, ScheduleBlock],
     style: Optional[Dict[str, Any]] = None,
@@ -39,9 +41,10 @@ def draw(
     show_snapshot: bool = True,
     show_framechange: bool = True,
     show_waveform_info: bool = True,
-    show_barrier: bool = True,
+    plot_barrier: bool = True,
     plotter: str = types.Plotter.Mpl2D.value,
     axis: Optional[Any] = None,
+    show_barrier: bool = True,
 ):
     """Generate visualization data for pulse programs.
 
@@ -66,7 +69,7 @@ def draw(
             instructions that modulate phase or frequency of pulse channels.
         show_waveform_info: Show waveform annotations, i.e. name, of waveforms.
             Set ``True`` to show additional information about waveforms.
-        show_barrier: Show barrier lines.
+        plot_barrier: Show barrier lines.
         plotter: Name of plotter API to generate an output image.
             One of following APIs should be specified::
 
@@ -79,6 +82,7 @@ def draw(
             the plotters use a given ``axis`` instead of internally initializing
             a figure object. This object format depends on the plotter.
             See plotter argument for details.
+        show_barrier: DEPRECATED. Show barrier lines.
 
     Returns:
         Visualization output data.
@@ -379,6 +383,7 @@ def draw(
         MissingOptionalLibraryError: When required visualization package is not installed.
         VisualizationError: When invalid plotter API or invalid time range is specified.
     """
+    del show_barrier
     temp_style = stylesheet.QiskitPulseStyle()
     temp_style.update(style or stylesheet.IQXStandard())
 
@@ -425,7 +430,7 @@ def draw(
         canvas.set_disable_type(types.LabelType.PULSE_NAME, remove=True)
 
     # show barrier
-    if not show_barrier:
+    if not plot_barrier:
         canvas.set_disable_type(types.LineType.BARRIER, remove=True)
 
     canvas.update()
diff --git a/qiskit/visualization/timeline/interface.py b/qiskit/visualization/timeline/interface.py
index ef0f072c3882..f5bb7e16f8d3 100644
--- a/qiskit/visualization/timeline/interface.py
+++ b/qiskit/visualization/timeline/interface.py
@@ -25,21 +25,27 @@
 from qiskit.exceptions import MissingOptionalLibraryError
 from qiskit.visualization.exceptions import VisualizationError
 from qiskit.visualization.timeline import types, core, stylesheet
+from qiskit.utils import deprecate_arg
 
 
+@deprecate_arg("show_idle", new_alias="idle_wires", since="1.1.0", pending=True)
+@deprecate_arg("show_barriers", new_alias="plot_barriers", since="1.1.0", pending=True)
 def draw(
     program: circuit.QuantumCircuit,
     style: Optional[Dict[str, Any]] = None,
     time_range: Tuple[int, int] = None,
     disable_bits: List[types.Bits] = None,
     show_clbits: Optional[bool] = None,
-    show_idle: Optional[bool] = None,
-    show_barriers: Optional[bool] = None,
+    idle_wires: Optional[bool] = None,
+    plot_barriers: Optional[bool] = None,
     show_delays: Optional[bool] = None,
     show_labels: bool = True,
     plotter: Optional[str] = types.Plotter.MPL.value,
     axis: Optional[Any] = None,
     filename: Optional[str] = None,
+    *,
+    show_idle: Optional[bool] = None,
+    show_barriers: Optional[bool] = None,
 ):
     r"""Generate visualization data for scheduled circuit programs.
 
@@ -55,9 +61,9 @@ def draw(
         disable_bits: List of qubits of classical bits not shown in the output image.
         show_clbits: A control property to show classical bits.
             Set `True` to show classical bits.
-        show_idle: A control property to show idle timeline.
+        idle_wires: A control property to show idle timeline.
             Set `True` to show timeline without gates.
-        show_barriers: A control property to show barrier instructions.
+        plot_barriers: A control property to show barrier instructions.
             Set `True` to show barrier instructions.
         show_delays: A control property to show delay instructions.
             Set `True` to show delay instructions.
@@ -75,6 +81,8 @@ def draw(
             the plotters uses given `axis` instead of internally initializing a figure object.
             This object format depends on the plotter. See plotters section for details.
         filename: If provided the output image is dumped into a file under the filename.
+        show_idle: DEPRECATED.
+        show_barriers: DEPRECATED.
 
     Returns:
         Visualization output data.
@@ -347,19 +355,21 @@ def draw(
         This feature enables you to control the most of appearance of the output image
         without modifying the codebase of the scheduled circuit drawer.
     """
+    del show_idle
+    del show_barriers
     # update stylesheet
     temp_style = stylesheet.QiskitTimelineStyle()
     temp_style.update(style or stylesheet.IQXStandard())
 
     # update control properties
-    if show_idle is not None:
-        temp_style["formatter.control.show_idle"] = show_idle
+    if idle_wires is not None:
+        temp_style["formatter.control.show_idle"] = idle_wires
 
     if show_clbits is not None:
         temp_style["formatter.control.show_clbits"] = show_clbits
 
-    if show_barriers is not None:
-        temp_style["formatter.control.show_barriers"] = show_barriers
+    if plot_barriers is not None:
+        temp_style["formatter.control.show_barriers"] = plot_barriers
 
     if show_delays is not None:
         temp_style["formatter.control.show_delays"] = show_delays
diff --git a/releasenotes/notes/add-run-all-plugins-option-ba8806a269e5713c.yaml b/releasenotes/notes/add-run-all-plugins-option-ba8806a269e5713c.yaml
new file mode 100644
index 000000000000..2ab34c61fb35
--- /dev/null
+++ b/releasenotes/notes/add-run-all-plugins-option-ba8806a269e5713c.yaml
@@ -0,0 +1,51 @@
+---
+features:
+  - |
+    The :class:`~.HLSConfig` now has two additional optional arguments. The argument
+    ``plugin_selection`` can be set either to ``"sequential"`` or to ``"all"``.
+    If set to "sequential" (default), for every higher-level-object
+    the :class:`~qiskit.transpiler.passes.HighLevelSynthesis` pass will consider the
+    specified methods sequentially, in the order they appear in the list, stopping
+    at the first method that is able to synthesize the object. If set to "all",
+    all the specified methods will be considered, and the best synthesized circuit,
+    according to ``plugin_evaluation_fn`` will be chosen. The argument
+    ``plugin_evaluation_fn`` is an optional callable that evaluates the quality of
+    the synthesized quantum circuit; a smaller value means a better circuit. When
+    set to ``None``, the quality of the circuit is its size (i.e. the number of gates
+    that it contains).
+
+    The following example illustrates the new functionality::
+
+        from qiskit import QuantumCircuit
+        from qiskit.circuit.library import LinearFunction
+        from qiskit.synthesis.linear import random_invertible_binary_matrix
+        from qiskit.transpiler.passes import HighLevelSynthesis, HLSConfig
+
+        # Create a circuit with a linear function
+        mat = random_invertible_binary_matrix(7, seed=37)
+        qc = QuantumCircuit(7)
+        qc.append(LinearFunction(mat), [0, 1, 2, 3, 4, 5, 6])
+
+        # Run different methods with different parameters,
+        # choosing the best result in terms of depth.
+        hls_config = HLSConfig(
+            linear_function=[
+                ("pmh", {}),
+                ("pmh", {"use_inverted": True}),
+                ("pmh", {"use_transposed": True}),
+                ("pmh", {"use_inverted": True, "use_transposed": True}),
+                ("pmh", {"section_size": 1}),
+                ("pmh", {"section_size": 3}),
+                ("kms", {}),
+                ("kms", {"use_inverted": True}),
+            ],
+            plugin_selection="all",
+            plugin_evaluation_fn=lambda circuit: circuit.depth(),
+        )
+
+        # synthesize
+        qct = HighLevelSynthesis(hls_config=hls_config)(qc)
+
+    In the example, we run multiple synthesis methods with different parameters,
+    choosing the best circuit in terms of depth. Note that optimizing
+    ``circuit.size()`` instead would pick a different circuit.
diff --git a/releasenotes/notes/add-scheduler-warnings-da6968a39fd8e6e7.yaml b/releasenotes/notes/add-scheduler-warnings-da6968a39fd8e6e7.yaml
new file mode 100644
index 000000000000..be2c94140300
--- /dev/null
+++ b/releasenotes/notes/add-scheduler-warnings-da6968a39fd8e6e7.yaml
@@ -0,0 +1,11 @@
+fixes:
+  - |
+    Fixed an issue where the :class:`.ConstrainedReschedule` transpiler pass would previously error
+    if the circuit contained a :class:`~.circuit.Reset` instruction. This has been corrected so that the
+    pass no longer errors; however, actual hardware may behave differently from
+    what the Qiskit scheduler assumes, especially for
+    mid-circuit measurements and resets.
+    The Qiskit scheduler raises a ``RuntimeWarning`` if
+    it encounters a circuit containing either.
+    Fixed `#10354 <https://github.com/Qiskit/qiskit/issues/10354>`__
+
diff --git a/releasenotes/notes/add-use-dag-flag-two-qubit-basis-decomposer-024a9ced9833289c.yaml b/releasenotes/notes/add-use-dag-flag-two-qubit-basis-decomposer-024a9ced9833289c.yaml
new file mode 100644
index 000000000000..4607560d96a0
--- /dev/null
+++ b/releasenotes/notes/add-use-dag-flag-two-qubit-basis-decomposer-024a9ced9833289c.yaml
@@ -0,0 +1,18 @@
+---
+features_synthesis:
+  - |
+    Added a new argument, ``use_dag``, to the :meth:`.TwoQubitBasisDecomposer.__call__`
+    and :meth:`.XXDecomposer.__call__` methods. This argument is used to control whether
+    a :class:`.DAGCircuit` is returned when calling a :class:`.TwoQubitBasisDecomposer`
+    or :class:`.XXDecomposer` instance instead of the default :class:`.QuantumCircuit`.
+    For example::
+
+        from qiskit.circuit.library import CXGate
+        from qiskit.quantum_info import random_unitary
+        from qiskit.synthesis import TwoQubitBasisDecomposer
+
+        decomposer = TwoQubitBasisDecomposer(CXGate(), euler_basis="PSX")
+        decomposer(random_unitary(4), use_dag=True)
+
+    will return a :class:`.DAGCircuit` when calling the :class:`.TwoQubitBasisDecomposer`
+    instance ``decomposer``.
diff --git a/releasenotes/notes/fix-backend-primitives-performance-1409b08ccc2a5ce9.yaml b/releasenotes/notes/fix-backend-primitives-performance-1409b08ccc2a5ce9.yaml
new file mode 100644
index 000000000000..883122d6f5b4
--- /dev/null
+++ b/releasenotes/notes/fix-backend-primitives-performance-1409b08ccc2a5ce9.yaml
@@ -0,0 +1,6 @@
+---
+fixes:
+  - |
+    Fixed a performance issue in the :class:`~.BackendSamplerV2` and
+    :class:`~.BackendEstimatorV2`.
+    Fixed `#12290 <https://github.com/Qiskit/qiskit/issues/12290>`__
\ No newline at end of file
diff --git a/releasenotes/notes/fixes_10852-e197344c5f44b4f1.yaml b/releasenotes/notes/fixes_10852-e197344c5f44b4f1.yaml
new file mode 100644
index 000000000000..755403d98a32
--- /dev/null
+++ b/releasenotes/notes/fixes_10852-e197344c5f44b4f1.yaml
@@ -0,0 +1,5 @@
+---
+features_providers:
+  - |
+    The :class:`.BasicSimulator` python-based simulator included in :mod:`qiskit.providers.basic_provider`
+    now includes all the standard gates (:mod:`qiskit.circuit.library.standard_gates`) up to 3 qubits.
diff --git a/releasenotes/notes/macos-arm64-tier-1-c5030f009be6adcb.yaml b/releasenotes/notes/macos-arm64-tier-1-c5030f009be6adcb.yaml
new file mode 100644
index 000000000000..b59f5b9844c9
--- /dev/null
+++ b/releasenotes/notes/macos-arm64-tier-1-c5030f009be6adcb.yaml
@@ -0,0 +1,11 @@
+---
+other:
+  - |
+    Support for the arm64 macOS platform has been promoted from Tier 3
+    to Tier 1. Previously the platform was at Tier 3 because there was
+    no available CI environment for testing Qiskit on the platform. Now
+    that GitHub has made an arm64 macOS environment available to open source
+    projects [#]_ we're testing the platform along with the other Tier 1
+    supported platforms.
+
+    .. [#] https://github.blog/changelog/2024-01-30-github-actions-introducing-the-new-m1-macos-runner-available-to-open-source/
diff --git a/releasenotes/notes/nlocal-perf-3b8ebd9be1b2f4b3.yaml b/releasenotes/notes/nlocal-perf-3b8ebd9be1b2f4b3.yaml
new file mode 100644
index 000000000000..9f8afa4e859a
--- /dev/null
+++ b/releasenotes/notes/nlocal-perf-3b8ebd9be1b2f4b3.yaml
@@ -0,0 +1,9 @@
+---
+features_circuits:
+  - |
+    The construction performance of :class:`.NLocal` and its derived circuit-library subclasses
+    (e.g. :class:`.EfficientSU2` and :class:`.RealAmplitudes`) has significantly improved, when the
+    rotation and/or entanglement subblocks are simple applications of a single Qiskit
+    standard-library gate.  Since these circuits are constructed lazily, you might not see the
+    improvement immediately on instantiation of the class, but instead on first access to its
+    internal structure.  Performance improvements are on the order of ten times faster.
diff --git a/releasenotes/notes/numpy-2.0-2f3e35bd42c48518.yaml b/releasenotes/notes/numpy-2.0-2f3e35bd42c48518.yaml
new file mode 100644
index 000000000000..3595f2f936bd
--- /dev/null
+++ b/releasenotes/notes/numpy-2.0-2f3e35bd42c48518.yaml
@@ -0,0 +1,5 @@
+---
+features:
+  - |
+    This release of Qiskit finalizes support for NumPy 2.0.  Qiskit will continue to support both
+    NumPy 1.x and 2.x for the foreseeable future.
diff --git a/releasenotes/notes/operator-from-circuit-bugfix-5dab5993526a2b0a.yaml b/releasenotes/notes/operator-from-circuit-bugfix-5dab5993526a2b0a.yaml
new file mode 100644
index 000000000000..759f023efc87
--- /dev/null
+++ b/releasenotes/notes/operator-from-circuit-bugfix-5dab5993526a2b0a.yaml
@@ -0,0 +1,7 @@
+---
+fixes:
+  - |
+    Fixed an issue with the :meth:`.Operator.from_circuit` constructor method where it would incorrectly
+    interpret the final layout permutation, resulting in an invalid :class:`.Operator` being constructed.
+    Previously, the final layout was processed without regard for the initial layout, i.e. the
+    initialization was incorrect for all quantum circuits that have a non-trivial initial layout.
diff --git a/releasenotes/notes/parameter_assignment_by_name_for_pulse_schedules-3a27bbbbf235fb9e.yaml b/releasenotes/notes/parameter_assignment_by_name_for_pulse_schedules-3a27bbbbf235fb9e.yaml
new file mode 100644
index 000000000000..551ea9e918c6
--- /dev/null
+++ b/releasenotes/notes/parameter_assignment_by_name_for_pulse_schedules-3a27bbbbf235fb9e.yaml
@@ -0,0 +1,8 @@
+---
+features_pulse:
+  - |
+    It is now possible to assign parameters to pulse :class:`.Schedule` and :class:`.ScheduleBlock` objects by specifying
+    the parameter name as a string. The parameter name can be used to assign values to all parameters within the
+    ``Schedule`` or ``ScheduleBlock`` that have the same name. Moreover, the parameter name of a ``ParameterVector``
+    can be used to assign all values of the vector simultaneously (the list of values should therefore match the
+    length of the vector).
diff --git a/releasenotes/notes/qasm3-parameter-gate-clash-34ef7b0383849a78.yaml b/releasenotes/notes/qasm3-parameter-gate-clash-34ef7b0383849a78.yaml
new file mode 100644
index 000000000000..217fbc464121
--- /dev/null
+++ b/releasenotes/notes/qasm3-parameter-gate-clash-34ef7b0383849a78.yaml
@@ -0,0 +1,7 @@
+---
+fixes:
+  - |
+    :class:`.Parameter` instances used as stand-ins for ``input`` variables in
+    OpenQASM 3 programs will now have their names escaped to avoid collisions
+    with built-in gates during the export to OpenQASM 3.  Previously there
+    could be a naming clash, and the exporter would generate invalid OpenQASM 3.
diff --git a/releasenotes/notes/reverse-permutation-lnn-409a07c7f6d0eed9.yaml b/releasenotes/notes/reverse-permutation-lnn-409a07c7f6d0eed9.yaml
new file mode 100644
index 000000000000..357345adfa26
--- /dev/null
+++ b/releasenotes/notes/reverse-permutation-lnn-409a07c7f6d0eed9.yaml
@@ -0,0 +1,8 @@
+---
+features_synthesis:
+  - |
+    Added a new synthesis method :func:`.synth_permutation_reverse_lnn_kms`
+    of reverse permutations for linear nearest-neighbor architectures using
+    the Kutin, Moulton, Smithline method.
+    This algorithm synthesizes the reverse permutation on :math:`n` qubits over
+    a linear nearest-neighbor architecture using CX gates with depth :math:`2 * n + 2`.
diff --git a/releasenotes/notes/rework-inst-durations-passes-28c78401682e22c0.yaml b/releasenotes/notes/rework-inst-durations-passes-28c78401682e22c0.yaml
new file mode 100644
index 000000000000..2ccd92f19c14
--- /dev/null
+++ b/releasenotes/notes/rework-inst-durations-passes-28c78401682e22c0.yaml
@@ -0,0 +1,15 @@
+---
+fixes:
+  - |
+    The internal handling of custom circuit calibrations and :class:`.InstructionDurations`
+    has been offloaded from the :func:`.transpile` function to the individual transpiler passes: 
+    :class:`qiskit.transpiler.passes.scheduling.DynamicalDecoupling`,
+    :class:`qiskit.transpiler.passes.scheduling.padding.DynamicalDecoupling`. Before, 
+    instruction durations from circuit calibrations would not be taken into account unless 
+    they were manually incorporated into the ``instruction_durations`` input argument, but the passes
+    that need it now analyze the circuit and pick the most relevant duration value according 
+    to the following priority order: target > custom input > circuit calibrations.
+
+  - |
+    Fixed a bug in :func:`.transpile` where the ``num_processes`` argument would only be used
+    if ``dt`` or ``instruction_durations`` were provided. 
\ No newline at end of file
diff --git a/releasenotes/notes/show_idle_and_show_barrier-6e77e1f9d6f55599.yaml b/releasenotes/notes/show_idle_and_show_barrier-6e77e1f9d6f55599.yaml
new file mode 100644
index 000000000000..ac0994b9d3a9
--- /dev/null
+++ b/releasenotes/notes/show_idle_and_show_barrier-6e77e1f9d6f55599.yaml
@@ -0,0 +1,6 @@
+---
+deprecations:
+  - |
+    The parameters ``show_idle`` and ``show_barriers`` in the timeline drawers have been replaced by ``idle_wires`` and ``plot_barriers``
+    respectively to match the circuit drawer parameters. Their previous names are now deprecated and will be removed in the next major
+    release. The new parameters are fully equivalent.
diff --git a/releasenotes/notes/spo-to-matrix-26445a791e24f62a.yaml b/releasenotes/notes/spo-to-matrix-26445a791e24f62a.yaml
new file mode 100644
index 000000000000..135e83ef99b1
--- /dev/null
+++ b/releasenotes/notes/spo-to-matrix-26445a791e24f62a.yaml
@@ -0,0 +1,8 @@
+---
+features:
+  - |
+    The performance of :meth:`.SparsePauliOp.to_matrix` has been greatly improved for both dense and
+    sparse forms.  By default, both will now take advantage of threaded parallelism available on
+    your system, subject to the ``RAYON_NUM_THREADS`` environment variable.  You can temporarily
+    force serial execution using the new ``force_serial`` Boolean argument to
+    :meth:`~.SparsePauliOp.to_matrix`.
diff --git a/releasenotes/notes/use-target-in-transpile-7c04b14549a11f40.yaml b/releasenotes/notes/use-target-in-transpile-7c04b14549a11f40.yaml
new file mode 100644
index 000000000000..8e385ba01949
--- /dev/null
+++ b/releasenotes/notes/use-target-in-transpile-7c04b14549a11f40.yaml
@@ -0,0 +1,8 @@
+---
+upgrade:
+  - |
+    The :func:`.transpile` function has been upgraded to internally convert 
+    ``backend`` inputs of type :class:`.BackendV1` to :class:`.BackendV2`,
+    which allows the transpilation pipeline to now access the backend 
+    constraints through a :class:`.Target`. This change does not require any 
+    user action.
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 0a4f0f32c5a9..539f9587994d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,9 @@
 rustworkx>=0.14.0
-numpy>=1.17,<2
+numpy>=1.17,<3
 scipy>=1.5
 sympy>=1.3
 dill>=0.3
 python-dateutil>=2.8.0
 stevedore>=3.0.0
 typing-extensions
-symengine>=0.11
\ No newline at end of file
+symengine>=0.11
diff --git a/test/python/circuit/test_circuit_load_from_qpy.py b/test/python/circuit/test_circuit_load_from_qpy.py
index 766d555bda59..efae9697f187 100644
--- a/test/python/circuit/test_circuit_load_from_qpy.py
+++ b/test/python/circuit/test_circuit_load_from_qpy.py
@@ -22,7 +22,7 @@
 import numpy as np
 
 from qiskit import QuantumCircuit, QuantumRegister, ClassicalRegister, pulse
-from qiskit.circuit import CASE_DEFAULT
+from qiskit.circuit import CASE_DEFAULT, IfElseOp, WhileLoopOp, SwitchCaseOp
 from qiskit.circuit.classical import expr, types
 from qiskit.circuit.classicalregister import Clbit
 from qiskit.circuit.quantumregister import Qubit
@@ -57,7 +57,7 @@
 from qiskit.circuit.parameter import Parameter
 from qiskit.circuit.parametervector import ParameterVector
 from qiskit.synthesis import LieTrotter, SuzukiTrotter
-from qiskit.qpy import dump, load
+from qiskit.qpy import dump, load, UnsupportedFeatureForVersion, QPY_COMPATIBILITY_VERSION
 from qiskit.quantum_info import Pauli, SparsePauliOp, Clifford
 from qiskit.quantum_info.random import random_unitary
 from qiskit.circuit.controlledgate import ControlledGate
@@ -84,6 +84,26 @@ def assertDeprecatedBitProperties(self, original, roundtripped):
             original_clbits, roundtripped_clbits = zip(*owned_clbits)
             self.assertEqual(original_clbits, roundtripped_clbits)
 
+    def assertMinimalVarEqual(self, left, right):
+        """Replacement for asserting `QuantumCircuit` equality for use in `Var` tests, for use while
+        the `DAGCircuit` does not yet allow full equality checks.  This should be removed and the
+        tests changed to directly call `assertEqual` once possible.
+
+        This filters out instructions that have `QuantumCircuit` parameters in the data comparison
+        (such as control-flow ops), which need to be handled separately."""
+        self.assertEqual(list(left.iter_input_vars()), list(right.iter_input_vars()))
+        self.assertEqual(list(left.iter_declared_vars()), list(right.iter_declared_vars()))
+        self.assertEqual(list(left.iter_captured_vars()), list(right.iter_captured_vars()))
+
+        def filter_ops(data):
+            return [
+                ins
+                for ins in data
+                if not any(isinstance(x, QuantumCircuit) for x in ins.operation.params)
+            ]
+
+        self.assertEqual(filter_ops(left.data), filter_ops(right.data))
+
     def test_qpy_full_path(self):
         """Test full path qpy serialization for basic circuit."""
         qr_a = QuantumRegister(4, "a")
@@ -1760,6 +1780,152 @@ def test_annotated_operations_iterative(self):
             new_circuit = load(fptr)[0]
         self.assertEqual(circuit, new_circuit)
 
+    def test_load_empty_vars(self):
+        """Test loading empty circuits with variables."""
+        a = expr.Var.new("a", types.Bool())
+        b = expr.Var.new("b", types.Uint(8))
+        all_vars = {
+            a: expr.lift(False),
+            b: expr.lift(3, type=b.type),
+            expr.Var.new("θψφ", types.Bool()): expr.logic_not(a),
+            expr.Var.new("🐍🐍🐍", types.Uint(8)): expr.bit_and(b, b),
+        }
+
+        inputs = QuantumCircuit(inputs=list(all_vars))
+        with io.BytesIO() as fptr:
+            dump(inputs, fptr)
+            fptr.seek(0)
+            new_inputs = load(fptr)[0]
+        self.assertMinimalVarEqual(inputs, new_inputs)
+        self.assertDeprecatedBitProperties(inputs, new_inputs)
+
+        # Reversed order just to check there's no sorting shenanigans.
+        captures = QuantumCircuit(captures=list(all_vars)[::-1])
+        with io.BytesIO() as fptr:
+            dump(captures, fptr)
+            fptr.seek(0)
+            new_captures = load(fptr)[0]
+        self.assertMinimalVarEqual(captures, new_captures)
+        self.assertDeprecatedBitProperties(captures, new_captures)
+
+        declares = QuantumCircuit(declarations=all_vars)
+        with io.BytesIO() as fptr:
+            dump(declares, fptr)
+            fptr.seek(0)
+            new_declares = load(fptr)[0]
+        self.assertMinimalVarEqual(declares, new_declares)
+        self.assertDeprecatedBitProperties(declares, new_declares)
+
+    def test_load_empty_vars_if(self):
+        """Test loading circuit with vars in if/else closures."""
+        a = expr.Var.new("a", types.Bool())
+        b = expr.Var.new("θψφ", types.Bool())
+        c = expr.Var.new("c", types.Uint(8))
+        d = expr.Var.new("🐍🐍🐍", types.Uint(8))
+
+        qc = QuantumCircuit(inputs=[a])
+        qc.add_var(b, expr.logic_not(a))
+        qc.add_var(c, expr.lift(0, c.type))
+        with qc.if_test(b) as else_:
+            qc.store(c, expr.lift(3, c.type))
+        with else_:
+            qc.add_var(d, expr.lift(7, d.type))
+
+        with io.BytesIO() as fptr:
+            dump(qc, fptr)
+            fptr.seek(0)
+            new_qc = load(fptr)[0]
+        self.assertMinimalVarEqual(qc, new_qc)
+        self.assertDeprecatedBitProperties(qc, new_qc)
+
+        old_if_else = qc.data[-1].operation
+        new_if_else = new_qc.data[-1].operation
+        # Sanity check for test.
+        self.assertIsInstance(old_if_else, IfElseOp)
+        self.assertIsInstance(new_if_else, IfElseOp)
+        self.assertEqual(len(old_if_else.blocks), len(new_if_else.blocks))
+
+        for old, new in zip(old_if_else.blocks, new_if_else.blocks):
+            self.assertMinimalVarEqual(old, new)
+            self.assertDeprecatedBitProperties(old, new)
+
+    def test_load_empty_vars_while(self):
+        """Test loading circuit with vars in while closures."""
+        a = expr.Var.new("a", types.Bool())
+        b = expr.Var.new("θψφ", types.Bool())
+        c = expr.Var.new("🐍🐍🐍", types.Uint(8))
+
+        qc = QuantumCircuit(inputs=[a])
+        qc.add_var(b, expr.logic_not(a))
+        with qc.while_loop(b):
+            qc.add_var(c, expr.lift(7, c.type))
+
+        with io.BytesIO() as fptr:
+            dump(qc, fptr)
+            fptr.seek(0)
+            new_qc = load(fptr)[0]
+        self.assertMinimalVarEqual(qc, new_qc)
+        self.assertDeprecatedBitProperties(qc, new_qc)
+
+        old_while = qc.data[-1].operation
+        new_while = new_qc.data[-1].operation
+        # Sanity check for test.
+        self.assertIsInstance(old_while, WhileLoopOp)
+        self.assertIsInstance(new_while, WhileLoopOp)
+        self.assertEqual(len(old_while.blocks), len(new_while.blocks))
+
+        for old, new in zip(old_while.blocks, new_while.blocks):
+            self.assertMinimalVarEqual(old, new)
+            self.assertDeprecatedBitProperties(old, new)
+
+    def test_load_empty_vars_switch(self):
+        """Test loading circuit with vars in switch closures."""
+        a = expr.Var.new("🐍🐍🐍", types.Uint(8))
+
+        qc = QuantumCircuit(1, 1, inputs=[a])
+        qc.measure(0, 0)
+        b_outer = qc.add_var("b", False)
+        with qc.switch(a) as case:
+            with case(0):
+                qc.store(b_outer, True)
+            with case(1):
+                qc.store(qc.clbits[0], False)
+            with case(2):
+                # Explicit shadowing.
+                qc.add_var("b", True)
+            with case(3):
+                qc.store(a, expr.lift(1, a.type))
+            with case(case.DEFAULT):
+                pass
+
+        with io.BytesIO() as fptr:
+            dump(qc, fptr)
+            fptr.seek(0)
+            new_qc = load(fptr)[0]
+        self.assertMinimalVarEqual(qc, new_qc)
+        self.assertDeprecatedBitProperties(qc, new_qc)
+
+        old_switch = qc.data[-1].operation
+        new_switch = new_qc.data[-1].operation
+        # Sanity check for test.
+        self.assertIsInstance(old_switch, SwitchCaseOp)
+        self.assertIsInstance(new_switch, SwitchCaseOp)
+        self.assertEqual(len(old_switch.blocks), len(new_switch.blocks))
+
+        for old, new in zip(old_switch.blocks, new_switch.blocks):
+            self.assertMinimalVarEqual(old, new)
+            self.assertDeprecatedBitProperties(old, new)
+
+    @ddt.idata(range(QPY_COMPATIBILITY_VERSION, 12))
+    def test_pre_v12_rejects_standalone_var(self, version):
+        """Test that dumping to older QPY versions rejects standalone vars."""
+        a = expr.Var.new("a", types.Bool())
+        qc = QuantumCircuit(inputs=[a])
+        with io.BytesIO() as fptr, self.assertRaisesRegex(
+            UnsupportedFeatureForVersion, "version 12 is required.*realtime variables"
+        ):
+            dump(qc, fptr, version=version)
+
 
 class TestSymengineLoadFromQPY(QiskitTestCase):
     """Test use of symengine in qpy set of methods."""
diff --git a/test/python/circuit/test_circuit_operations.py b/test/python/circuit/test_circuit_operations.py
index 483224196798..9a934d70c710 100644
--- a/test/python/circuit/test_circuit_operations.py
+++ b/test/python/circuit/test_circuit_operations.py
@@ -485,6 +485,69 @@ def test_copy_empty_variables(self):
         self.assertEqual({b, d}, set(copied.iter_captured_vars()))
         self.assertEqual({b}, set(qc.iter_captured_vars()))
 
+    def test_copy_empty_variables_alike(self):
+        """Spelling out ``vars_mode="alike"`` matches the default behaviour: the variables are
+        copied across to the empty circuit in the same roles, but are not initialised."""
+        a = expr.Var.new("a", types.Bool())
+        b = expr.Var.new("b", types.Uint(8))
+        c = expr.Var.new("c", types.Bool())
+        d = expr.Var.new("d", types.Uint(8))
+
+        source = QuantumCircuit(inputs=[a], declarations=[(c, expr.lift(False))])
+        empty = source.copy_empty_like(vars_mode="alike")
+        self.assertEqual({a}, set(empty.iter_input_vars()))
+        self.assertEqual({c}, set(empty.iter_declared_vars()))
+        self.assertEqual([], list(empty.data))
+
+        # Growing the copy must leave the source circuit's variables alone.
+        empty.add_input(b)
+        empty.add_var(d, 0xFF)
+        self.assertEqual({a, b}, set(empty.iter_input_vars()))
+        self.assertEqual({c, d}, set(empty.iter_declared_vars()))
+        self.assertEqual({a}, set(source.iter_input_vars()))
+        self.assertEqual({c}, set(source.iter_declared_vars()))
+
+        source = QuantumCircuit(captures=[b], declarations=[(a, expr.lift(False)), (c, a)])
+        empty = source.copy_empty_like(vars_mode="alike")
+        self.assertEqual({b}, set(empty.iter_captured_vars()))
+        self.assertEqual({a, c}, set(empty.iter_declared_vars()))
+        self.assertEqual([], list(empty.data))
+
+        # Growing the copy must leave the source circuit's variables alone.
+        empty.add_capture(d)
+        self.assertEqual({b, d}, set(empty.iter_captured_vars()))
+        self.assertEqual({b}, set(source.iter_captured_vars()))
+
+    def test_copy_empty_variables_to_captures(self):
+        """With ``vars_mode="captures"``, every variable of the source becomes a capture."""
+        a = expr.Var.new("a", types.Bool())
+        b = expr.Var.new("b", types.Uint(8))
+        c = expr.Var.new("c", types.Bool())
+        d = expr.Var.new("d", types.Uint(8))
+
+        source = QuantumCircuit(inputs=[a, b], declarations=[(c, expr.lift(False))])
+        closure = source.copy_empty_like(vars_mode="captures")
+        self.assertEqual({a, b, c}, set(closure.iter_captured_vars()))
+        self.assertEqual({a, b, c}, set(closure.iter_vars()))
+        self.assertEqual([], list(closure.data))
+
+        source = QuantumCircuit(captures=[c, d])
+        closure = source.copy_empty_like(vars_mode="captures")
+        self.assertEqual({c, d}, set(closure.iter_captured_vars()))
+        self.assertEqual({c, d}, set(closure.iter_vars()))
+        self.assertEqual([], list(closure.data))
+
+    def test_copy_empty_variables_drop(self):
+        """With ``vars_mode="drop"``, the empty copy carries no variables whatsoever."""
+        inputs = [expr.Var.new("a", types.Bool()), expr.Var.new("b", types.Uint(8))]
+        declared = expr.Var.new("c", types.Bool())
+        initialiser = expr.lift(False)
+
+        source = QuantumCircuit(inputs=inputs, declarations=[(declared, initialiser)])
+        stripped = source.copy_empty_like(vars_mode="drop")
+        self.assertEqual(set(), set(stripped.iter_vars()))
+        self.assertEqual([], list(stripped.data))
+
     def test_copy_empty_like_parametric_phase(self):
         """Test that the parameter table of an empty circuit remains valid after copying a circuit
         with a parametric global phase."""
diff --git a/test/python/circuit/test_circuit_vars.py b/test/python/circuit/test_circuit_vars.py
index 0da541085366..8b7167eed7e1 100644
--- a/test/python/circuit/test_circuit_vars.py
+++ b/test/python/circuit/test_circuit_vars.py
@@ -14,7 +14,7 @@
 
 from test import QiskitTestCase
 
-from qiskit.circuit import QuantumCircuit, CircuitError, Clbit, ClassicalRegister
+from qiskit.circuit import QuantumCircuit, CircuitError, Clbit, ClassicalRegister, Store
 from qiskit.circuit.classical import expr, types
 
 
@@ -241,6 +241,30 @@ def test_initialise_declarations_equal_to_add_var(self):
         self.assertEqual(list(qc_init.iter_vars()), list(qc_manual.iter_vars()))
         self.assertEqual(qc_init.data, qc_manual.data)
 
+    def test_declarations_widen_integer_literals(self):
+        """Integer literal initialisers are lifted to the exact type of the declared variable."""
+        narrow = expr.Var.new("a", types.Uint(8))
+        wide = expr.Var.new("b", types.Uint(16))
+        circuit = QuantumCircuit(declarations=[(narrow, 3)])
+        circuit.add_var(wide, 5)
+        # Gather the (target, value) pair of each `Store` in circuit order.
+        operations = (instruction.operation for instruction in circuit)
+        observed = [(op.lvalue, op.rvalue) for op in operations if isinstance(op, Store)]
+        # Both literals would fit in fewer bits, but should carry the variable's full width.
+        expected = [
+            (narrow, expr.Value(3, types.Uint(8))),
+            (wide, expr.Value(5, types.Uint(16))),
+        ]
+        self.assertEqual(observed, expected)
+
+    def test_declaration_does_not_widen_bool_literal(self):
+        """A Python ``bool`` initialiser must not be silently widened to a ``Uint`` variable."""
+        # `bool` subclasses `int` in Python, but not in Qiskit's value type system.
+        target = expr.Var.new("a", types.Uint(8))
+        circuit = QuantumCircuit()
+        with self.assertRaisesRegex(CircuitError, "explicit cast is required"):
+            circuit.add_var(target, True)
+
     def test_cannot_shadow_vars(self):
         """Test that exact duplicate ``Var`` nodes within different combinations of the inputs are
         detected and rejected."""
diff --git a/test/python/circuit/test_compose.py b/test/python/circuit/test_compose.py
index 03301899a6ae..0e481c12b33b 100644
--- a/test/python/circuit/test_compose.py
+++ b/test/python/circuit/test_compose.py
@@ -34,7 +34,7 @@
     CircuitError,
 )
 from qiskit.circuit.library import HGate, RZGate, CXGate, CCXGate, TwoLocal
-from qiskit.circuit.classical import expr
+from qiskit.circuit.classical import expr, types
 from test import QiskitTestCase  # pylint: disable=wrong-import-order
 
 
@@ -901,6 +901,118 @@ def test_expr_target_is_mapped(self):
 
         self.assertEqual(dest, expected)
 
+    def test_join_unrelated_vars(self):
+        """Composing circuits whose variables do not overlap yields the union of the variables."""
+        flag = expr.Var.new("a", types.Bool())
+        byte = expr.Var.new("b", types.Uint(8))
+
+        lhs = QuantumCircuit(inputs=[flag])
+        rhs = QuantumCircuit(inputs=[byte])
+        joined = lhs.compose(rhs)
+        self.assertEqual({flag, byte}, set(joined.iter_vars()))
+        self.assertEqual({flag, byte}, set(joined.iter_input_vars()))
+        # The left-hand circuit must not have been modified by the out-of-place compose.
+        self.assertEqual({flag}, set(lhs.iter_vars()))
+
+        lhs = QuantumCircuit(captures=[flag])
+        rhs = QuantumCircuit(captures=[byte])
+        joined = lhs.compose(rhs)
+        self.assertEqual({flag, byte}, set(joined.iter_vars()))
+        self.assertEqual({flag, byte}, set(joined.iter_captured_vars()))
+        self.assertEqual({flag}, set(lhs.iter_vars()))
+
+        lhs = QuantumCircuit(inputs=[flag])
+        rhs = QuantumCircuit(declarations=[(byte, 255)])
+        joined = lhs.compose(rhs)
+        self.assertEqual({flag, byte}, set(joined.iter_vars()))
+        self.assertEqual({flag}, set(joined.iter_input_vars()))
+        self.assertEqual({byte}, set(joined.iter_declared_vars()))
+
+    def test_var_remap_to_avoid_collisions(self):
+        """A name clash between the two circuits can be resolved through `var_remap`."""
+        ours = expr.Var.new("a", types.Bool())
+        theirs = expr.Var.new("a", types.Bool())
+        substitute = expr.Var.new("b", types.Bool())
+
+        lhs = QuantumCircuit(inputs=[ours])
+        rhs = QuantumCircuit(inputs=[theirs])
+
+        joined = lhs.compose(rhs, var_remap={theirs: substitute})
+        self.assertEqual([ours, substitute], list(joined.iter_input_vars()))
+        self.assertEqual([ours, substitute], list(joined.iter_vars()))
+
+        joined = lhs.compose(rhs, var_remap={"a": substitute})
+        self.assertEqual([ours, substitute], list(joined.iter_input_vars()))
+        self.assertEqual([ours, substitute], list(joined.iter_vars()))
+
+        joined = lhs.compose(rhs, var_remap={"a": "c"})
+        self.assertTrue(joined.has_var("c"))
+        created = joined.get_var("c")
+        self.assertEqual(created.name, "c")
+        self.assertEqual([ours, created], list(joined.iter_input_vars()))
+        self.assertEqual([ours, created], list(joined.iter_vars()))
+
+    def test_simple_inline_captures(self):
+        """Captured variables of the composed circuit can be inlined onto the base's variables."""
+        a = expr.Var.new("a", types.Bool())
+        b = expr.Var.new("b", types.Bool())
+        c = expr.Var.new("c", types.Uint(8))
+
+        outer = QuantumCircuit(inputs=[a, b])
+        outer.add_var(c, 255)
+        outer.store(a, expr.logic_or(a, b))
+        closure = QuantumCircuit(captures=[a, b, c])
+        closure.store(c, 254)
+        closure.store(b, expr.logic_or(a, b))
+        inlined = outer.compose(closure, inline_captures=True)
+
+        reference = QuantumCircuit(inputs=[a, b])
+        reference.add_var(c, 255)
+        reference.store(a, expr.logic_or(a, b))
+        reference.store(c, 254)
+        reference.store(b, expr.logic_or(a, b))
+        self.assertEqual(inlined, reference)
+
+    def test_can_inline_a_capture_after_remapping(self):
+        """`var_remap` can redefine a captured variable _before_ it is inlined, including its
+        uses inside deeply nested scopes.  This is a stress test of capture inlining."""
+        a = expr.Var.new("a", types.Bool())
+        b = expr.Var.new("b", types.Bool())
+        c = expr.Var.new("c", types.Uint(8))
+
+        # `qc` closes over `b`, which `base` does not contain, so its captures cannot be inlined
+        # as-is; the compose is only expected to work once `var_remap` has replaced `b` by `a`.
+        # (This isn't expected to be especially useful in practice; it just follows from how
+        # the order between `var_remap` and `inline_captures` is defined.)
+        base = QuantumCircuit(inputs=[a])
+        qc = QuantumCircuit(declarations=[(c, 255)], captures=[b])
+        qc.store(b, expr.logic_and(b, b))
+        with qc.if_test(expr.logic_not(b)):
+            with qc.while_loop(b):
+                qc.store(b, expr.logic_not(b))
+            # `c` is declared by `qc` itself and only captured by this nested scope, so this also
+            # checks that `inline_captures` doesn't do something silly with inner captures.
+            with qc.switch(c) as case:
+                with case(0):
+                    qc.store(c, expr.bit_and(c, 255))
+                with case(case.DEFAULT):
+                    qc.store(b, expr.equal(c, 255))
+        base.compose(qc, inplace=True, inline_captures=True, var_remap={b: a})
+
+        expected = QuantumCircuit(inputs=[a], declarations=[(c, 255)])
+        expected.store(a, expr.logic_and(a, a))
+        with expected.if_test(expr.logic_not(a)):
+            with expected.while_loop(a):
+                expected.store(a, expr.logic_not(a))
+            # `c` is used unchanged here: only `b` was listed in `var_remap`.
+            with expected.switch(c) as case:
+                with case(0):
+                    expected.store(c, expr.bit_and(c, 255))
+                with case(case.DEFAULT):
+                    expected.store(a, expr.equal(c, 255))
+
+        self.assertEqual(base, expected)
+
     def test_rejects_duplicate_bits(self):
         """Test that compose rejects duplicates in either qubits or clbits."""
         base = QuantumCircuit(5, 5)
@@ -911,6 +1023,55 @@ def test_rejects_duplicate_bits(self):
         with self.assertRaisesRegex(CircuitError, "Duplicate clbits"):
             base.compose(attempt, [0, 1], [1, 1])
 
+    def test_cannot_mix_inputs_and_captures(self):
+        """Composition must still refuse to combine `input` vars with `capture` vars."""
+        flag = expr.Var.new("a", types.Bool())
+        byte = expr.Var.new("b", types.Uint(8))
+        with self.assertRaisesRegex(CircuitError, "circuits with input variables cannot be"):
+            QuantumCircuit(inputs=[flag]).compose(QuantumCircuit(captures=[byte]))
+        with self.assertRaisesRegex(CircuitError, "circuits to be enclosed with captures cannot"):
+            QuantumCircuit(captures=[flag]).compose(QuantumCircuit(inputs=[byte]))
+
+    def test_reject_var_naming_collision(self):
+        """Two distinct vars that share a name cannot end up in the same composed circuit."""
+        first = expr.Var.new("a", types.Bool())
+        twin = expr.Var.new("a", types.Bool())
+        other = expr.Var.new("b", types.Bool())
+        self.assertNotEqual(first, twin)
+
+        with self.assertRaisesRegex(CircuitError, "cannot add.*shadows"):
+            QuantumCircuit(inputs=[first]).compose(QuantumCircuit(inputs=[twin]))
+        with self.assertRaisesRegex(CircuitError, "cannot add.*shadows"):
+            QuantumCircuit(captures=[first]).compose(QuantumCircuit(declarations=[(twin, False)]))
+        with self.assertRaisesRegex(CircuitError, "cannot add.*shadows"):
+            QuantumCircuit(declarations=[(first, True)]).compose(
+                QuantumCircuit(inputs=[other]), var_remap={other: twin}
+            )
+
+    def test_reject_remap_var_to_bad_type(self):
+        """`var_remap` may not replace a var by one of a different type."""
+        flag = expr.Var.new("a", types.Bool())
+        byte = expr.Var.new("b", types.Uint(8))
+        inner = QuantumCircuit(inputs=[flag])
+        with self.assertRaisesRegex(CircuitError, "mismatched types"):
+            QuantumCircuit().compose(inner, var_remap={flag: byte})
+        inner = QuantumCircuit(captures=[byte])
+        with self.assertRaisesRegex(CircuitError, "mismatched types"):
+            QuantumCircuit().compose(inner, var_remap={byte: flag})
+
+    def test_reject_inlining_missing_var(self):
+        """A capture cannot be inlined if the base circuit has no matching var."""
+        a = expr.Var.new("a", types.Bool())
+        b = expr.Var.new("b", types.Bool())
+        inner = QuantumCircuit(captures=[a])
+        with self.assertRaisesRegex(CircuitError, "Variable '.*' to be inlined is not in the base"):
+            QuantumCircuit().compose(inner, inline_captures=True)
+
+        # The base does contain 'a', but the remap swaps it for 'b' before the inline happens.
+        inner = QuantumCircuit(captures=[a])
+        with self.assertRaisesRegex(CircuitError, "Replacement '.*' for variable '.*' is not in"):
+            QuantumCircuit(inputs=[a]).compose(inner, var_remap={a: b}, inline_captures=True)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/test/python/circuit/test_store.py b/test/python/circuit/test_store.py
index b44aac51f7a5..425eae55a4bf 100644
--- a/test/python/circuit/test_store.py
+++ b/test/python/circuit/test_store.py
@@ -133,6 +133,22 @@ def test_lifts_values(self):
         qc.store(b, 0xFFFF)
         self.assertEqual(qc.data[-1].operation, Store(b, expr.lift(0xFFFF)))
 
+    def test_lifts_integer_literals_to_full_width(self):
+        """Stored integer literals are lifted to the full type of the target variable."""
+        var = expr.Var.new("a", types.Uint(8))
+        qc = QuantumCircuit(inputs=[var])
+        for value in (1, 255):
+            qc.store(var, value)
+            self.assertEqual(qc.data[-1].operation, Store(var, expr.Value(value, var.type)))
+
+    def test_does_not_widen_bool_literal(self):
+        """Storing a Python ``bool`` to a ``Uint`` variable must demand an explicit cast."""
+        # `bool` subclasses `int` in Python, but not in Qiskit's value type system.
+        target = expr.Var.new("a", types.Uint(8))
+        circuit = QuantumCircuit(captures=[target])
+        with self.assertRaisesRegex(CircuitError, "explicit cast is required"):
+            circuit.store(target, True)
+
     def test_rejects_vars_not_in_circuit(self):
         a = expr.Var.new("a", types.Bool())
         b = expr.Var.new("b", types.Bool())
diff --git a/test/python/converters/test_circuit_to_dag.py b/test/python/converters/test_circuit_to_dag.py
index 4f2f52d03780..0bded9c0f4a2 100644
--- a/test/python/converters/test_circuit_to_dag.py
+++ b/test/python/converters/test_circuit_to_dag.py
@@ -15,9 +15,9 @@
 import unittest
 
 from qiskit.dagcircuit import DAGCircuit
-from qiskit.circuit import QuantumRegister, ClassicalRegister, QuantumCircuit, Clbit
+from qiskit.circuit import QuantumRegister, ClassicalRegister, QuantumCircuit, Clbit, SwitchCaseOp
 from qiskit.circuit.library import HGate, Measure
-from qiskit.circuit.classical import expr
+from qiskit.circuit.classical import expr, types
 from qiskit.converters import dag_to_circuit, circuit_to_dag
 from test import QiskitTestCase  # pylint: disable=wrong-import-order
 
@@ -106,6 +106,38 @@ def test_wires_from_expr_nodes_target(self):
         for original, test in zip(outer, roundtripped):
             self.assertEqual(original.operation.target, test.operation.target)
 
+    def test_runtime_vars_in_roundtrip(self):
+        """Converting to a DAG and back should preserve every `expr.Var` node."""
+        a = expr.Var.new("a", types.Bool())
+        b = expr.Var.new("b", types.Bool())
+        c = expr.Var.new("c", types.Uint(8))
+        d = expr.Var.new("d", types.Uint(8))
+        circuit = QuantumCircuit(inputs=[a, c])
+        circuit.add_var(b, False)
+        circuit.add_var(d, 255)
+        circuit.store(a, expr.logic_or(a, b))
+        with circuit.if_test(expr.logic_and(a, expr.equal(c, d))):
+            pass
+        with circuit.while_loop(a):
+            circuit.store(a, expr.logic_or(a, b))
+        with circuit.switch(d) as case:
+            with case(0):
+                circuit.store(c, d)
+            with case(case.DEFAULT):
+                circuit.store(a, False)
+
+        restored = dag_to_circuit(circuit_to_dag(circuit))
+        self.assertEqual(circuit, restored)
+
+        self.assertIsInstance(circuit.data[-1].operation, SwitchCaseOp)
+        # The switch stays topologically last, so it is still the final instruction afterwards.
+        self.assertIsInstance(restored.data[-1].operation, SwitchCaseOp)
+        self.assertEqual(circuit.data[-1].operation.blocks, restored.data[-1].operation.blocks)
+
+        case_bodies = restored.data[-1].operation.blocks
+        self.assertEqual(set(case_bodies[0].iter_captured_vars()), {c, d})
+        self.assertEqual(set(case_bodies[1].iter_captured_vars()), {a})
+
     def test_wire_order(self):
         """Test that the `qubit_order` and `clbit_order` parameters are respected."""
         permutation = [2, 3, 1, 4, 0, 5]  # Arbitrary.
diff --git a/test/python/converters/test_circuit_to_gate.py b/test/python/converters/test_circuit_to_gate.py
index de3ad079e566..8e71a7f595a2 100644
--- a/test/python/converters/test_circuit_to_gate.py
+++ b/test/python/converters/test_circuit_to_gate.py
@@ -18,6 +18,7 @@
 
 from qiskit import QuantumRegister, QuantumCircuit
 from qiskit.circuit import Gate, Qubit
+from qiskit.circuit.classical import expr, types
 from qiskit.quantum_info import Operator
 from qiskit.exceptions import QiskitError
 from test import QiskitTestCase  # pylint: disable=wrong-import-order
@@ -122,3 +123,16 @@ def test_zero_operands(self):
         compound = QuantumCircuit(1)
         compound.append(gate, [], [])
         np.testing.assert_allclose(-np.eye(2), Operator(compound), atol=1e-16)
+
+    def test_realtime_vars_rejected(self):
+        """Converting to a gate must fail for every kind of realtime variable."""
+        message = "circuits with realtime classical variables"
+        with_input = QuantumCircuit(1, inputs=[expr.Var.new("a", types.Bool())])
+        with_capture = QuantumCircuit(1, captures=[expr.Var.new("a", types.Bool())])
+        with_declaration = QuantumCircuit(1)
+        with_declaration.add_var("a", False)
+
+        # Inputs, captures and locally declared variables are all expected to be refused.
+        for circuit in (with_input, with_capture, with_declaration):
+            with self.assertRaisesRegex(QiskitError, message):
+                circuit.to_gate()
diff --git a/test/python/converters/test_circuit_to_instruction.py b/test/python/converters/test_circuit_to_instruction.py
index 56a227dbad92..d4b69e71aa10 100644
--- a/test/python/converters/test_circuit_to_instruction.py
+++ b/test/python/converters/test_circuit_to_instruction.py
@@ -21,6 +21,7 @@
 from qiskit import QuantumRegister, ClassicalRegister, QuantumCircuit
 from qiskit.circuit import Qubit, Clbit, Instruction
 from qiskit.circuit import Parameter
+from qiskit.circuit.classical import expr, types
 from qiskit.quantum_info import Operator
 from qiskit.exceptions import QiskitError
 from test import QiskitTestCase  # pylint: disable=wrong-import-order
@@ -218,6 +219,38 @@ def test_zero_operands(self):
         compound.append(instruction, [], [])
         np.testing.assert_allclose(-np.eye(2), Operator(compound), atol=1e-16)
 
+    def test_forbids_captured_vars(self):
+        """A circuit that closes over outer variables cannot be converted to an instruction."""
+        closure = QuantumCircuit(captures=[expr.Var.new("a", types.Bool())])
+        with self.assertRaisesRegex(QiskitError, "Circuits that capture variables cannot"):
+            closure.to_instruction()
+
+    def test_forbids_input_vars(self):
+        """Input variables are rejected for now; relax this test once they are supported.
+
+        There is a natural interpretation, in which the inputs become typed parameters of the
+        instruction, but there is no formal structure for handling that yet, so the conversion
+        is forbidden until the library is ready for it."""
+        parametric = QuantumCircuit(inputs=[expr.Var.new("a", types.Bool())])
+        with self.assertRaisesRegex(QiskitError, "Circuits with 'input' variables cannot"):
+            parametric.to_instruction()
+
+    def test_forbids_declared_vars(self):
+        """Declared variables are rejected for now; relax this test once they are supported.
+
+        The representation would be very natural and need essentially no special handling,
+        because the variables are necessarily entirely internal to the subroutine.  It starts
+        out forbidden because there is no good way yet to rename variables while unrolling
+        during transpilation, and the error should suggest an alternative at the point where
+        the conversion is attempted."""
+        circuit = QuantumCircuit()
+        circuit.add_var("a", False)
+
+        # The message is expected to point the user at composition as a workaround.
+        msg = "Circuits with internal variables.*You may be able to use `QuantumCircuit.compose`"
+        with self.assertRaisesRegex(QiskitError, msg):
+            circuit.to_instruction()
+
 
 if __name__ == "__main__":
     unittest.main(verbosity=2)
diff --git a/test/python/dagcircuit/test_compose.py b/test/python/dagcircuit/test_compose.py
index c2862eb200fe..ff5014eacef7 100644
--- a/test/python/dagcircuit/test_compose.py
+++ b/test/python/dagcircuit/test_compose.py
@@ -22,9 +22,10 @@
     WhileLoopOp,
     SwitchCaseOp,
     CASE_DEFAULT,
+    Store,
 )
 from qiskit.circuit.classical import expr, types
-from qiskit.dagcircuit import DAGCircuit
+from qiskit.dagcircuit import DAGCircuit, DAGCircuitError
 from qiskit.converters import circuit_to_dag, dag_to_circuit
 from qiskit.pulse import Schedule
 from qiskit.circuit.gate import Gate
@@ -540,6 +541,91 @@ def test_compose_expr_target(self):
 
         self.assertEqual(dest, circuit_to_dag(expected))
 
+    def test_join_unrelated_dags(self):
+        """Composing DAGs with disjoint variables is unusual, but should simply merge them."""
+        a = expr.Var.new("a", types.Bool())
+        b = expr.Var.new("b", types.Bool())
+        c = expr.Var.new("c", types.Uint(8))
+
+        target = DAGCircuit()
+        target.add_input_var(a)
+        target.apply_operation_back(Store(a, expr.lift(False)), (), ())
+        addition = DAGCircuit()
+        addition.add_declared_var(b)
+        addition.add_input_var(c)
+        addition.apply_operation_back(Store(b, expr.lift(True)), (), ())
+        target.compose(addition)
+
+        reference = DAGCircuit()
+        reference.add_input_var(a)
+        reference.add_declared_var(b)
+        reference.add_input_var(c)
+        reference.apply_operation_back(Store(a, expr.lift(False)), (), ())
+        reference.apply_operation_back(Store(b, expr.lift(True)), (), ())
+
+        self.assertEqual(target, reference)
+
+    def test_join_unrelated_dags_captures(self):
+        """Without inlining, the captures of both DAGs should simply be merged."""
+        a = expr.Var.new("a", types.Bool())
+        b = expr.Var.new("b", types.Bool())
+        c = expr.Var.new("c", types.Uint(8))
+
+        target = DAGCircuit()
+        target.add_captured_var(a)
+        target.apply_operation_back(Store(a, expr.lift(False)), (), ())
+        addition = DAGCircuit()
+        addition.add_declared_var(b)
+        addition.add_captured_var(c)
+        addition.apply_operation_back(Store(b, expr.lift(True)), (), ())
+        target.compose(addition, inline_captures=False)
+
+        reference = DAGCircuit()
+        reference.add_captured_var(a)
+        reference.add_declared_var(b)
+        reference.add_captured_var(c)
+        reference.apply_operation_back(Store(a, expr.lift(False)), (), ())
+        reference.apply_operation_back(Store(b, expr.lift(True)), (), ())
+
+        self.assertEqual(target, reference)
+
+    def test_inline_capture_var(self):
+        """Uses of a captured var can be appended onto a DAG that already contains that var."""
+        a = expr.Var.new("a", types.Bool())
+        b = expr.Var.new("b", types.Bool())
+
+        target = DAGCircuit()
+        target.add_input_var(a)
+        target.add_input_var(b)
+        target.apply_operation_back(Store(a, expr.lift(False)), (), ())
+        closure = DAGCircuit()
+        closure.add_captured_var(b)
+        closure.apply_operation_back(Store(b, expr.lift(True)), (), ())
+        target.compose(closure, inline_captures=True)
+
+        reference = DAGCircuit()
+        reference.add_input_var(a)
+        reference.add_input_var(b)
+        reference.apply_operation_back(Store(a, expr.lift(False)), (), ())
+        reference.apply_operation_back(Store(b, expr.lift(True)), (), ())
+
+        self.assertEqual(target, reference)
+
+    def test_reject_inline_to_nonexistent_var(self):
+        """Inlining a capture must fail when the base DAG does not contain that variable."""
+        a = expr.Var.new("a", types.Bool())
+        b = expr.Var.new("b", types.Bool())
+
+        target = DAGCircuit()
+        target.add_input_var(a)
+        target.apply_operation_back(Store(a, expr.lift(False)), (), ())
+        closure = DAGCircuit()
+        closure.add_captured_var(b)
+        with self.assertRaisesRegex(
+            DAGCircuitError, "Variable '.*' to be inlined is not in the base DAG"
+        ):
+            target.compose(closure, inline_captures=True)
+
     def test_compose_calibrations(self):
         """Test that compose carries over the calibrations."""
         dag_cal = QuantumCircuit(1)
diff --git a/test/python/dagcircuit/test_dagcircuit.py b/test/python/dagcircuit/test_dagcircuit.py
index 3fcf5ff7a27e..14033e522c62 100644
--- a/test/python/dagcircuit/test_dagcircuit.py
+++ b/test/python/dagcircuit/test_dagcircuit.py
@@ -38,8 +38,10 @@
     SwitchCaseOp,
     IfElseOp,
     WhileLoopOp,
+    CASE_DEFAULT,
+    Store,
 )
-from qiskit.circuit.classical import expr
+from qiskit.circuit.classical import expr, types
 from qiskit.circuit.library import IGate, HGate, CXGate, CZGate, XGate, YGate, U1Gate, RXGate
 from qiskit.converters import circuit_to_dag
 from test import QiskitTestCase  # pylint: disable=wrong-import-order
@@ -421,6 +423,67 @@ def test_copy_empty_like(self):
         self.assertEqual(self.dag.duration, result_dag.duration)
         self.assertEqual(self.dag.unit, result_dag.unit)
 
+    def test_copy_empty_like_vars(self):
+        """An empty copy of a DAG keeps the variables of the original."""
+        with_inputs = DAGCircuit()
+        with_inputs.add_input_var(expr.Var.new("a", types.Bool()))
+        with_inputs.add_input_var(expr.Var.new("b", types.Uint(8)))
+        with_inputs.add_declared_var(expr.Var.new("c", types.Bool()))
+        with_inputs.add_declared_var(expr.Var.new("d", types.Uint(8)))
+        self.assertEqual(with_inputs, with_inputs.copy_empty_like())
+
+        with_captures = DAGCircuit()
+        with_captures.add_captured_var(expr.Var.new("a", types.Bool()))
+        with_captures.add_captured_var(expr.Var.new("b", types.Uint(8)))
+        with_captures.add_declared_var(expr.Var.new("c", types.Bool()))
+        with_captures.add_declared_var(expr.Var.new("d", types.Uint(8)))
+        self.assertEqual(with_captures, with_captures.copy_empty_like())
+
+    def test_copy_empty_like_vars_captures(self):
+        """With ``vars_mode="captures"``, the empty copy holds every variable as a capture."""
+        a = expr.Var.new("a", types.Bool())
+        b = expr.Var.new("b", types.Uint(8))
+        c = expr.Var.new("c", types.Bool())
+        d = expr.Var.new("d", types.Uint(8))
+        reference = DAGCircuit()
+        for captured in (a, b, c, d):
+            reference.add_captured_var(captured)
+
+        with_inputs = DAGCircuit()
+        with_inputs.add_input_var(a)
+        with_inputs.add_input_var(b)
+        with_inputs.add_declared_var(c)
+        with_inputs.add_declared_var(d)
+        self.assertEqual(reference, with_inputs.copy_empty_like(vars_mode="captures"))
+
+        with_captures = DAGCircuit()
+        with_captures.add_captured_var(a)
+        with_captures.add_captured_var(b)
+        with_captures.add_declared_var(c)
+        with_captures.add_declared_var(d)
+        self.assertEqual(reference, with_captures.copy_empty_like(vars_mode="captures"))
+
+    def test_copy_empty_like_vars_drop(self):
+        """With ``vars_mode="drop"``, the empty copy compares equal to a fresh, empty DAG."""
+        a = expr.Var.new("a", types.Bool())
+        b = expr.Var.new("b", types.Uint(8))
+        c = expr.Var.new("c", types.Bool())
+        d = expr.Var.new("d", types.Uint(8))
+
+        with_inputs = DAGCircuit()
+        with_inputs.add_input_var(a)
+        with_inputs.add_input_var(b)
+        with_inputs.add_declared_var(c)
+        with_inputs.add_declared_var(d)
+        self.assertEqual(DAGCircuit(), with_inputs.copy_empty_like(vars_mode="drop"))
+
+        with_captures = DAGCircuit()
+        with_captures.add_captured_var(a)
+        with_captures.add_captured_var(b)
+        with_captures.add_declared_var(c)
+        with_captures.add_declared_var(d)
+        self.assertEqual(DAGCircuit(), with_captures.copy_empty_like(vars_mode="drop"))
+
     def test_remove_busy_clbit(self):
         """Classical bit removal of busy classical bits raises."""
         self.dag.apply_operation_back(Measure(), [self.qreg[0]], [self.individual_clbit])
@@ -1822,6 +1885,231 @@ def test_semantic_expr(self):
         qc2.switch(expr.bit_and(cr, 5), [(1, body)], [0], [])
         self.assertNotEqual(circuit_to_dag(qc1), circuit_to_dag(qc2))
 
+    def test_present_vars(self):
+        """The vars should be compared whether or not they're used."""
+        a_bool = expr.Var.new("a", types.Bool())
+        a_u8 = expr.Var.new("a", types.Uint(8))
+        a_u8_other = expr.Var.new("a", types.Uint(8))
+        b_bool = expr.Var.new("b", types.Bool())
+
+        left = DAGCircuit()  # reference DAG: two input vars, no operations
+        left.add_input_var(a_bool)
+        left.add_input_var(b_bool)
+        self.assertEqual(left.num_input_vars, 2)
+        self.assertEqual(left.num_captured_vars, 0)
+        self.assertEqual(left.num_declared_vars, 0)
+        self.assertEqual(left.num_vars, 2)
+
+        right = DAGCircuit()  # exactly the same input vars
+        right.add_input_var(a_bool)
+        right.add_input_var(b_bool)
+        self.assertEqual(right.num_input_vars, 2)
+        self.assertEqual(right.num_captured_vars, 0)
+        self.assertEqual(right.num_declared_vars, 0)
+        self.assertEqual(right.num_vars, 2)
+        self.assertEqual(left, right)
+
+        right = DAGCircuit()  # "a" has the same name but a different type
+        right.add_input_var(a_u8)
+        right.add_input_var(b_bool)
+        self.assertEqual(right.num_input_vars, 2)
+        self.assertEqual(right.num_captured_vars, 0)
+        self.assertEqual(right.num_declared_vars, 0)
+        self.assertEqual(right.num_vars, 2)
+        self.assertNotEqual(left, right)
+
+        right = DAGCircuit()  # no vars at all
+        self.assertEqual(right.num_input_vars, 0)
+        self.assertEqual(right.num_captured_vars, 0)
+        self.assertEqual(right.num_declared_vars, 0)
+        self.assertEqual(right.num_vars, 0)
+        self.assertNotEqual(left, right)
+
+        right = DAGCircuit()  # same vars, but captured rather than input
+        right.add_captured_var(a_bool)
+        right.add_captured_var(b_bool)
+        self.assertEqual(right.num_input_vars, 0)
+        self.assertEqual(right.num_captured_vars, 2)
+        self.assertEqual(right.num_declared_vars, 0)
+        self.assertEqual(right.num_vars, 2)
+        self.assertNotEqual(left, right)
+
+        right = DAGCircuit()  # same vars, but declared rather than input
+        right.add_declared_var(a_bool)
+        right.add_declared_var(b_bool)
+        self.assertEqual(right.num_input_vars, 0)
+        self.assertEqual(right.num_captured_vars, 0)
+        self.assertEqual(right.num_declared_vars, 2)
+        self.assertEqual(right.num_vars, 2)
+        self.assertNotEqual(left, right)
+
+        left = DAGCircuit()
+        left.add_captured_var(a_u8)
+
+        right = DAGCircuit()  # the very same Var instance
+        right.add_captured_var(a_u8)
+        self.assertEqual(left, right)
+
+        right = DAGCircuit()  # equal name and type, but a separately created Var
+        right.add_captured_var(a_u8_other)
+        self.assertNotEqual(left, right)
+
+    def test_wires_added_for_simple_classical_vars(self):
+        """Var uses should be represented in the wire structure."""
+        a = expr.Var.new("a", types.Bool())
+        dag = DAGCircuit()
+        dag.add_input_var(a)
+        self.assertEqual(list(dag.iter_vars()), [a])
+        self.assertEqual(list(dag.iter_input_vars()), [a])
+        self.assertEqual(list(dag.iter_captured_vars()), [])
+        self.assertEqual(list(dag.iter_declared_vars()), [])
+
+        expected_nodes = [dag.input_map[a], dag.output_map[a]]  # no ops yet: in -> out only
+        self.assertEqual(list(dag.topological_nodes()), expected_nodes)
+        self.assertTrue(dag.is_successor(dag.input_map[a], dag.output_map[a]))
+
+        op_mid = dag.apply_operation_back(Store(a, expr.lift(True)), (), ())  # in -> mid -> out
+        self.assertTrue(dag.is_successor(dag.input_map[a], op_mid))
+        self.assertTrue(dag.is_successor(op_mid, dag.output_map[a]))
+        self.assertFalse(dag.is_successor(dag.input_map[a], dag.output_map[a]))
+
+        op_front = dag.apply_operation_front(Store(a, expr.logic_not(a)), (), ())  # before op_mid
+        self.assertTrue(dag.is_successor(dag.input_map[a], op_front))
+        self.assertTrue(dag.is_successor(op_front, op_mid))
+        self.assertFalse(dag.is_successor(dag.input_map[a], op_mid))
+
+        op_back = dag.apply_operation_back(Store(a, expr.logic_not(a)), (), ())  # after op_mid
+        self.assertTrue(dag.is_successor(op_mid, op_back))
+        self.assertTrue(dag.is_successor(op_back, dag.output_map[a]))
+        self.assertFalse(dag.is_successor(op_mid, dag.output_map[a]))
+
+    def test_wires_added_for_var_control_flow_condition(self):
+        """A Var read by an if/else or while condition should be wired through that op."""
+        a = expr.Var.new("a", types.Bool())
+        b = expr.Var.new("b", types.Bool())
+        dag = DAGCircuit()
+        dag.add_declared_var(a)
+        dag.add_input_var(b)
+
+        store_node = dag.apply_operation_back(Store(a, expr.lift(False)), (), ())
+        if_node = dag.apply_operation_back(IfElseOp(a, QuantumCircuit()), (), ())
+        loop_condition = expr.logic_or(a, b)
+        loop_op = WhileLoopOp(loop_condition, QuantumCircuit())
+        while_node = dag.apply_operation_back(loop_op, (), ())
+
+        expected_edges = {
+            (dag.input_map[a], store_node, a),
+            (store_node, if_node, a),
+            (if_node, while_node, a),
+            (while_node, dag.output_map[a], a),
+            (dag.input_map[b], while_node, b),
+            (while_node, dag.output_map[b], b),
+        }
+        self.assertEqual(set(dag.edges()), expected_edges)
+
+    def test_wires_added_for_var_control_flow_target(self):
+        """Every Var appearing in a switch target should be wired through the switch op."""
+        a = expr.Var.new("a", types.Uint(8))
+        b = expr.Var.new("b", types.Uint(8))
+        dag = DAGCircuit()
+        dag.add_declared_var(a)
+        dag.add_input_var(b)
+
+        store_node = dag.apply_operation_back(Store(a, expr.lift(3, a.type)), (), ())
+        target = expr.bit_xor(a, b)
+        switch_op = SwitchCaseOp(target, [(CASE_DEFAULT, QuantumCircuit())])
+        switch_node = dag.apply_operation_back(switch_op, (), ())
+
+        expected_edges = {
+            (dag.input_map[a], store_node, a),
+            (store_node, switch_node, a),
+            (switch_node, dag.output_map[a], a),
+            (dag.input_map[b], switch_node, b),
+            (switch_node, dag.output_map[b], b),
+        }
+        self.assertEqual(set(dag.edges()), expected_edges)
+
+    def test_wires_added_for_control_flow_captures(self):
+        """Vars captured in control-flow blocks should be in the wire structure."""
+        a = expr.Var.new("a", types.Bool())
+        b = expr.Var.new("b", types.Bool())
+        c = expr.Var.new("c", types.Bool())
+        d = expr.Var.new("d", types.Uint(8))
+        dag = DAGCircuit()
+        dag.add_input_var(a)
+        dag.add_input_var(b)
+        dag.add_declared_var(c)
+        dag.add_input_var(d)
+        op_store = dag.apply_operation_back(Store(c, expr.lift(False)), (), ())
+        op_if = dag.apply_operation_back(IfElseOp(a, QuantumCircuit(captures=[b])), (), ())
+        op_switch = dag.apply_operation_back(
+            SwitchCaseOp(
+                d,
+                [
+                    (0, QuantumCircuit(captures=[b])),
+                    (CASE_DEFAULT, QuantumCircuit(captures=[c])),
+                ],
+            ),
+            (),
+            (),
+        )
+
+        expected_edges = {
+            # a: only used as the if/else condition
+            (dag.input_map[a], op_if, a),
+            (op_if, dag.output_map[a], a),
+            # b: captured by the if/else body, then by switch case 0
+            (dag.input_map[b], op_if, b),
+            (op_if, op_switch, b),
+            (op_switch, dag.output_map[b], b),
+            # c: written by the store, then captured by the switch default case
+            (dag.input_map[c], op_store, c),
+            (op_store, op_switch, c),
+            (op_switch, dag.output_map[c], c),
+            # d: only used as the switch target
+            (dag.input_map[d], op_switch, d),
+            (op_switch, dag.output_map[d], d),
+        }
+        self.assertEqual(set(dag.edges()), expected_edges)
+
+    def test_forbid_mixing_captures_inputs(self):
+        """A single DAG must not hold input vars and captured vars at the same time."""
+        a = expr.Var.new("a", types.Bool())
+        b = expr.Var.new("b", types.Bool())
+
+        # An input is present first; adding a capture must fail.
+        has_input = DAGCircuit()
+        has_input.add_input_var(a)
+        message = "cannot add captures to a circuit with inputs"
+        with self.assertRaisesRegex(DAGCircuitError, message):
+            has_input.add_captured_var(b)
+
+        # A capture is present first; adding an input must fail.
+        has_capture = DAGCircuit()
+        has_capture.add_captured_var(a)
+        message = "cannot add inputs to a circuit with captures"
+        with self.assertRaisesRegex(DAGCircuitError, message):
+            has_capture.add_input_var(b)
+
+    def test_forbid_adding_nonstandalone_var(self):
+        """Vars that merely wrap a register or bit aren't standalone, so can't be added."""
+        dag = DAGCircuit()
+        with self.assertRaisesRegex(DAGCircuitError, "cannot add variables that wrap"):
+            dag.add_input_var(expr.lift(ClassicalRegister(4, "c")))  # wraps a register
+        with self.assertRaisesRegex(DAGCircuitError, "cannot add variables that wrap"):
+            dag.add_declared_var(expr.lift(Clbit()))  # wraps a single bit
+
+    def test_forbid_adding_conflicting_vars(self):
+        """Can't re-add a variable that exists, nor a shadowing variable in the same scope."""
+        a1 = expr.Var.new("a", types.Bool())
+        a2 = expr.Var.new("a", types.Bool())  # separately created, same name and type as `a1`
+        dag = DAGCircuit()
+        dag.add_declared_var(a1)
+        with self.assertRaisesRegex(DAGCircuitError, "already present in the circuit"):
+            dag.add_declared_var(a1)  # the exact same Var again
+        with self.assertRaisesRegex(DAGCircuitError, "cannot add .* as its name shadows"):
+            dag.add_declared_var(a2)  # a different Var with a clashing name
+
 
 class TestDagSubstitute(QiskitTestCase):
     """Test substituting a dag node with a sub-dag"""
@@ -2012,6 +2300,125 @@ def test_substitute_dag_switch_expr(self):
 
         self.assertEqual(src, expected)
 
+    def test_substitute_dag_vars(self):
+        """A node can be swapped for a DAG that acts on the same Var wires."""
+        a = expr.Var.new("a", types.Bool())
+        b = expr.Var.new("b", types.Bool())
+        c = expr.Var.new("c", types.Bool())
+
+        src = DAGCircuit()
+        src.add_input_var(a)
+        src.add_input_var(b)
+        src.add_input_var(c)
+        src.apply_operation_back(Store(c, expr.lift(False)), (), ())
+        node = src.apply_operation_back(Store(a, expr.logic_or(expr.logic_or(a, b), c)), (), ())
+        src.apply_operation_back(Store(b, expr.lift(True)), (), ())
+
+        two_step = DAGCircuit()  # the store in `node`, split into two consecutive stores
+        two_step.add_captured_var(a)
+        two_step.add_captured_var(b)
+        two_step.add_captured_var(c)
+        two_step.apply_operation_back(Store(a, expr.logic_or(a, b)), (), ())
+        two_step.apply_operation_back(Store(a, expr.logic_or(a, c)), (), ())
+
+        expected = DAGCircuit()
+        expected.add_input_var(a)
+        expected.add_input_var(b)
+        expected.add_input_var(c)
+        expected.apply_operation_back(Store(c, expr.lift(False)), (), ())
+        expected.apply_operation_back(Store(a, expr.logic_or(a, b)), (), ())
+        expected.apply_operation_back(Store(a, expr.logic_or(a, c)), (), ())
+        expected.apply_operation_back(Store(b, expr.lift(True)), (), ())
+
+        src.substitute_node_with_dag(node, two_step, wires={})
+
+        self.assertEqual(src, expected)
+
+    def test_substitute_dag_if_else_expr_var(self):
+        """Test that substitution works with if/else ops with standalone Vars."""
+        a = expr.Var.new("a", types.Bool())
+        b = expr.Var.new("b", types.Bool())
+
+        body_rep = QuantumCircuit(1)
+        body_rep.z(0)
+
+        q_rep = QuantumRegister(1)
+        c_rep = ClassicalRegister(2)
+        replacement = DAGCircuit()
+        replacement.add_qreg(q_rep)
+        replacement.add_creg(c_rep)
+        replacement.add_captured_var(b)  # the replacement uses `b` only, never `a`
+        replacement.apply_operation_back(XGate(), [q_rep[0]], [])
+        replacement.apply_operation_back(
+            IfElseOp(expr.logic_and(b, expr.equal(c_rep, 1)), body_rep, None), [q_rep[0]], []
+        )
+
+        true_src = QuantumCircuit(1)
+        true_src.x(0)
+        true_src.z(0)
+        false_src = QuantumCircuit(1)
+        false_src.x(0)
+        q_src = QuantumRegister(4)
+        c1_src = ClassicalRegister(2)
+        c2_src = ClassicalRegister(2)
+        src = DAGCircuit()
+        src.add_qreg(q_src)
+        src.add_creg(c1_src)
+        src.add_creg(c2_src)
+        src.add_input_var(a)
+        src.add_input_var(b)
+        node = src.apply_operation_back(
+            IfElseOp(expr.logic_and(b, expr.equal(c1_src, 1)), true_src, false_src), [q_src[2]], []
+        )
+
+        wires = {q_rep[0]: q_src[2], c_rep[0]: c1_src[0], c_rep[1]: c1_src[1]}
+        src.substitute_node_with_dag(node, replacement, wires=wires)  # `wires` maps bits only
+
+        expected = DAGCircuit()
+        expected.add_qreg(q_src)
+        expected.add_creg(c1_src)
+        expected.add_creg(c2_src)
+        expected.add_input_var(a)
+        expected.add_input_var(b)
+        expected.apply_operation_back(XGate(), [q_src[2]], [])
+        expected.apply_operation_back(
+            IfElseOp(expr.logic_and(b, expr.equal(c1_src, 1)), body_rep, None), [q_src[2]], []
+        )
+
+        self.assertEqual(src, expected)
+
+    def test_contract_var_use_to_nothing(self):
+        """Replacing a Var-using node with an empty DAG should just remove the node."""
+        a = expr.Var.new("a", types.Bool())
+
+        expected = DAGCircuit()
+        expected.add_input_var(a)
+
+        dag = DAGCircuit()
+        dag.add_input_var(a)
+        noop_store = dag.apply_operation_back(Store(a, a), (), ())
+        # The empty replacement declares no wires at all, not even the one for `a`.
+        dag.substitute_node_with_dag(noop_store, DAGCircuit(), {})
+
+        self.assertEqual(dag, expected)
+
+    def test_raise_if_var_mismatch(self):
+        """The replacement DAG can't act on more Var wires than the node it replaces."""
+        a = expr.Var.new("a", types.Bool())
+        b = expr.Var.new("b", types.Bool())
+
+        dag = DAGCircuit()
+        dag.add_input_var(a)
+        target = dag.apply_operation_back(Store(a, a), (), ())
+
+        too_wide = DAGCircuit()  # also acts on `b`, which `target` never touches
+        too_wide.add_input_var(a)
+        too_wide.add_input_var(b)
+        too_wide.apply_operation_back(Store(a, b), (), ())
+
+        with self.assertRaisesRegex(DAGCircuitError, "Cannot replace a node with a DAG with more"):
+            dag.substitute_node_with_dag(target, too_wide, wires={})
+
     def test_raise_if_substituting_dag_modifies_its_conditional(self):
         """Verify that we raise if the input dag modifies any of the bits in node.op.condition."""
 
@@ -2402,6 +2809,55 @@ def test_reject_replace_switch_with_other_resources(self, inplace):
                 node, SwitchCaseOp(expr.lift(cr2), [((1, 3), case.copy())]), inplace=inplace
             )
 
+    @data(True, False)
+    def test_replace_switch_case_standalone_var(self, inplace):
+        """A switch on a standalone Var can be swapped for another switch using that Var."""
+        a = expr.Var.new("a", types.Uint(8))
+        b = expr.Var.new("b", types.Uint(8))
+
+        body = QuantumCircuit(1)
+        body.x(0)
+
+        qr = QuantumRegister(1)
+        dag = DAGCircuit()
+        dag.add_qreg(qr)
+        dag.add_input_var(a)
+        dag.add_input_var(b)
+        node = dag.apply_operation_back(SwitchCaseOp(a, [((1, 3), body.copy())]), qr, [])
+        new_target = expr.bit_and(a, 1)
+        replacement = SwitchCaseOp(new_target, [(1, body.copy())])
+        dag.substitute_node(node, replacement, inplace=inplace)
+
+        expected = DAGCircuit()
+        expected.add_qreg(qr)
+        expected.add_input_var(a)
+        expected.add_input_var(b)
+        expected.apply_operation_back(SwitchCaseOp(expr.bit_and(a, 1), [(1, body.copy())]), qr, [])
+
+        self.assertEqual(dag, expected)
+
+    @data(True, False)
+    def test_replace_store_standalone_var(self, inplace):
+        """Replace a standalone-Var Store with another Store that writes to the same Var."""
+        a = expr.Var.new("a", types.Bool())
+        b = expr.Var.new("b", types.Bool())
+
+        qr = QuantumRegister(1)
+        dag = DAGCircuit()
+        dag.add_qreg(qr)
+        dag.add_input_var(a)
+        dag.add_input_var(b)  # present in the DAG but not used by any operation
+        node = dag.apply_operation_back(Store(a, a), (), ())
+        dag.substitute_node(node, Store(a, expr.logic_not(a)), inplace=inplace)
+
+        expected = DAGCircuit()
+        expected.add_qreg(qr)
+        expected.add_input_var(a)
+        expected.add_input_var(b)
+        expected.apply_operation_back(Store(a, expr.logic_not(a)), (), ())
+
+        self.assertEqual(dag, expected)
+
 
 class TestReplaceBlock(QiskitTestCase):
     """Test replacing a block of nodes in a DAG."""
@@ -2486,6 +2942,34 @@ def test_replace_control_flow_block(self):
 
         self.assertEqual(dag, expected)
 
+    def test_contract_stores(self):
+        """A block of nodes on `Var` wires can be contracted into a single op."""
+        a = expr.Var.new("a", types.Bool())
+        b = expr.Var.new("b", types.Bool())
+        c = expr.Var.new("c", types.Bool())
+
+        dag = DAGCircuit()
+        dag.add_input_var(a)
+        dag.add_input_var(b)
+        dag.add_input_var(c)
+        dag.apply_operation_back(Store(c, expr.lift(False)), (), ())
+        first = dag.apply_operation_back(Store(a, expr.logic_or(a, b)), (), ())
+        second = dag.apply_operation_back(Store(a, expr.logic_or(a, c)), (), ())
+        dag.apply_operation_back(Store(b, expr.lift(True)), (), ())
+        # Fuse the two consecutive stores to `a` into one store of the combined expression.
+        fused = Store(a, expr.logic_or(expr.logic_or(a, b), c))
+        dag.replace_block_with_op([first, second], fused, {})
+
+        expected = DAGCircuit()
+        expected.add_input_var(a)
+        expected.add_input_var(b)
+        expected.add_input_var(c)
+        expected.apply_operation_back(Store(c, expr.lift(False)), (), ())
+        expected.apply_operation_back(Store(a, expr.logic_or(expr.logic_or(a, b), c)), (), ())
+        expected.apply_operation_back(Store(b, expr.lift(True)), (), ())
+
+        self.assertEqual(dag, expected)
+
 
 class TestDagProperties(QiskitTestCase):
     """Test the DAG properties."""
diff --git a/test/python/primitives/test_backend_estimator_v2.py b/test/python/primitives/test_backend_estimator_v2.py
index 570b117af583..6728d57e3fdd 100644
--- a/test/python/primitives/test_backend_estimator_v2.py
+++ b/test/python/primitives/test_backend_estimator_v2.py
@@ -16,6 +16,7 @@
 
 import unittest
 from test import QiskitTestCase, combine
+from unittest.mock import patch
 
 import numpy as np
 from ddt import ddt
@@ -28,7 +29,7 @@
 from qiskit.primitives.containers.observables_array import ObservablesArray
 from qiskit.providers.backend_compat import BackendV2Converter
 from qiskit.providers.basic_provider import BasicSimulator
-from qiskit.providers.fake_provider import Fake7QPulseV1
+from qiskit.providers.fake_provider import Fake7QPulseV1, GenericBackendV2
 from qiskit.quantum_info import SparsePauliOp
 from qiskit.transpiler.preset_passmanagers import generate_preset_pass_manager
 from qiskit.utils import optionals
@@ -371,6 +372,22 @@ def test_precision(self, backend, abelian_grouping):
         result = job.result()
         np.testing.assert_allclose(result[0].data.evs, [1.5555572817900956], rtol=self._rtol)
 
+    @combine(backend=BACKENDS, abelian_grouping=[True, False])
+    def test_diff_precision(self, backend, abelian_grouping):
+        """Pubs with and without an explicit precision can be mixed in a single run."""
+        estimator = BackendEstimatorV2(backend=backend, options=self._options)
+        estimator.options.abelian_grouping = abelian_grouping
+        pm = generate_preset_pass_manager(optimization_level=0, backend=backend)
+        psi1 = pm.run(self.psi[0])
+        hamiltonian1 = self.hamiltonian[0].apply_layout(psi1.layout)
+        theta1 = self.theta[0]
+        default_pub = (psi1, hamiltonian1, [theta1])
+        explicit_pub = (psi1, hamiltonian1, [theta1], self._precision * 0.8)
+        result = estimator.run([default_pub, explicit_pub]).result()
+        expected_evs = [1.901141473854881]
+        np.testing.assert_allclose(result[0].data.evs, expected_evs, rtol=self._rtol)
+        np.testing.assert_allclose(result[1].data.evs, expected_evs, rtol=self._rtol)
+
     @unittest.skipUnless(optionals.HAS_AER, "qiskit-aer is required to run this test")
     @combine(abelian_grouping=[True, False])
     def test_aer(self, abelian_grouping):
@@ -407,6 +424,55 @@ def test_aer(self, abelian_grouping):
                 result[0].data.evs, target[0].data.evs, rtol=self._rtol, atol=1e-1
             )
 
+    def test_job_size_limit_backend_v2(self):
+        """Test BackendEstimatorV2 respects a BackendV2's ``max_circuits`` job size limit."""
+
+        class FakeBackendLimitedCircuits(GenericBackendV2):
+            """Generic backend V2 with job size limit."""
+
+            @property
+            def max_circuits(self):
+                return 1  # at most one circuit per job
+
+        backend = FakeBackendLimitedCircuits(num_qubits=5)
+        qc = RealAmplitudes(num_qubits=2, reps=2)
+        # Note: two qubit-wise commuting groups
+        op = SparsePauliOp.from_list([("IZ", 1), ("XI", 2), ("ZY", -1)])
+        k = 5  # number of identical pubs submitted in one call
+        param_list = self._rng.random(qc.num_parameters).tolist()
+        estimator = BackendEstimatorV2(backend=backend)
+        with patch.object(backend, "run") as run_mock:  # count the jobs; nothing is executed
+            estimator.run([(qc, op, param_list)] * k).result()
+        self.assertEqual(run_mock.call_count, 10)  # presumably k pubs x 2 groups, 1 per job
+
+    def test_job_size_limit_backend_v1(self):
+        """Test BackendEstimatorV2 respects a BackendV1's ``max_experiments`` job size limit."""
+        backend = Fake7QPulseV1()
+        config = backend.configuration()
+        config.max_experiments = 1  # at most one circuit per job
+        backend._configuration = config
+        qc = RealAmplitudes(num_qubits=2, reps=2)
+        # Note: two qubit-wise commuting groups
+        op = SparsePauliOp.from_list([("IZ", 1), ("XI", 2), ("ZY", -1)])
+        k = 5  # number of identical pubs submitted in one call
+        param_list = self._rng.random(qc.num_parameters).tolist()
+        estimator = BackendEstimatorV2(backend=backend)
+        with patch.object(backend, "run") as run_mock:  # count the jobs; nothing is executed
+            estimator.run([(qc, op, param_list)] * k).result()
+        self.assertEqual(run_mock.call_count, 10)  # presumably k pubs x 2 groups, 1 per job
+
+    def test_iter_pub(self):
+        """An iterator of pubs should be accepted in place of a list."""
+        backend = BasicSimulator()
+        bound = self.ansatz.assign_parameters([0, 1, 1, 2, 3, 5])
+        pm = generate_preset_pass_manager(optimization_level=0, backend=backend)
+        isa_circuit = pm.run(bound)
+        estimator = BackendEstimatorV2(backend=backend, options=self._options)
+        observable = self.observable.apply_layout(isa_circuit.layout)
+        result = estimator.run(iter([(isa_circuit, observable), (isa_circuit, observable)])).result()
+        for pub_result in (result[0], result[1]):
+            np.testing.assert_allclose(pub_result.data.evs, [-1.284366511861733], rtol=self._rtol)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/test/python/primitives/test_backend_sampler_v2.py b/test/python/primitives/test_backend_sampler_v2.py
index dcddbf1126ad..dd58920689a3 100644
--- a/test/python/primitives/test_backend_sampler_v2.py
+++ b/test/python/primitives/test_backend_sampler_v2.py
@@ -33,7 +33,7 @@
 from qiskit.providers import JobStatus
 from qiskit.providers.backend_compat import BackendV2Converter
 from qiskit.providers.basic_provider import BasicSimulator
-from qiskit.providers.fake_provider import Fake7QPulseV1
+from qiskit.providers.fake_provider import Fake7QPulseV1, GenericBackendV2
 from qiskit.transpiler.preset_passmanagers import generate_preset_pass_manager
 
 BACKENDS = [BasicSimulator(), Fake7QPulseV1(), BackendV2Converter(Fake7QPulseV1())]
@@ -671,6 +671,85 @@ def test_empty_creg(self, backend):
         result = sampler.run([qc], shots=self._shots).result()
         self.assertEqual(result[0].data.c1.array.shape, (self._shots, 0))
 
+    @combine(backend=BACKENDS)
+    def test_diff_shots(self, backend):
+        """Pubs in a single run may each request their own number of shots."""
+        pm = generate_preset_pass_manager(optimization_level=0, backend=backend)
+
+        bell, _, target = self._cases[1]
+        bell = pm.run(bell)
+        sampler = BackendSamplerV2(backend=backend, options=self._options)
+        more_shots = self._shots + 2
+        more_target = {outcome: count + 1 for outcome, count in target.items()}
+        pubs = [(bell, None, self._shots), (bell, None, more_shots)]
+        result = sampler.run(pubs).result()
+        self.assertEqual(len(result), 2)
+        self.assertEqual(result[0].data.meas.num_shots, self._shots)
+        self._assert_allclose(result[0].data.meas, np.array(target))
+        self.assertEqual(result[1].data.meas.num_shots, more_shots)
+        self._assert_allclose(result[1].data.meas, np.array(more_target))
+
+    def test_job_size_limit_backend_v2(self):
+        """Test BackendSamplerV2 respects a BackendV2's ``max_circuits`` job size limit."""
+
+        class FakeBackendLimitedCircuits(GenericBackendV2):
+            """Generic backend V2 with job size limit."""
+
+            @property
+            def max_circuits(self):
+                return 1  # at most one circuit per job
+
+        qc = QuantumCircuit(1)  # measured without any gate applied
+        qc.measure_all()
+        qc2 = QuantumCircuit(1)  # X applied before measuring
+        qc2.x(0)
+        qc2.measure_all()
+        sampler = BackendSamplerV2(backend=FakeBackendLimitedCircuits(num_qubits=5))
+        result = sampler.run([qc, qc2], shots=self._shots).result()  # two pubs, limit of one
+        self.assertIsInstance(result, PrimitiveResult)
+        self.assertEqual(len(result), 2)
+        self.assertIsInstance(result[0], PubResult)
+        self.assertIsInstance(result[1], PubResult)
+        self._assert_allclose(result[0].data.meas, np.array({0: self._shots}))
+        self._assert_allclose(result[1].data.meas, np.array({1: self._shots}))
+
+    def test_job_size_limit_backend_v1(self):
+        """Test BackendSamplerV2 respects a BackendV1's ``max_experiments`` job size limit."""
+        backend = Fake7QPulseV1()
+        config = backend.configuration()
+        config.max_experiments = 1  # at most one circuit per job
+        backend._configuration = config
+        qc = QuantumCircuit(1)  # measured without any gate applied
+        qc.measure_all()
+        qc2 = QuantumCircuit(1)  # X applied before measuring
+        qc2.x(0)
+        qc2.measure_all()
+        sampler = BackendSamplerV2(backend=backend)
+        result = sampler.run([qc, qc2], shots=self._shots).result()  # two pubs, limit of one
+        self.assertIsInstance(result, PrimitiveResult)
+        self.assertEqual(len(result), 2)
+        self.assertIsInstance(result[0], PubResult)
+        self.assertIsInstance(result[1], PubResult)
+        self._assert_allclose(result[0].data.meas, np.array({0: self._shots}))
+        self._assert_allclose(result[1].data.meas, np.array({1: self._shots}))
+
+    def test_iter_pub(self):
+        """An iterator of pubs should be accepted in place of a list."""
+        sampler = BackendSamplerV2(backend=BasicSimulator())
+        measure_only = QuantumCircuit(1)
+        measure_only.measure_all()
+        flip_then_measure = QuantumCircuit(1)
+        flip_then_measure.x(0)
+        flip_then_measure.measure_all()
+        pubs = iter([measure_only, flip_then_measure])
+        result = sampler.run(pubs, shots=self._shots).result()
+        self.assertIsInstance(result, PrimitiveResult)
+        self.assertEqual(len(result), 2)
+        self.assertIsInstance(result[0], PubResult)
+        self.assertIsInstance(result[1], PubResult)
+        self._assert_allclose(result[0].data.meas, np.array({0: self._shots}))
+        self._assert_allclose(result[1].data.meas, np.array({1: self._shots}))
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/test/python/primitives/test_statevector_estimator.py b/test/python/primitives/test_statevector_estimator.py
index 15c022f770cd..117ead6717ad 100644
--- a/test/python/primitives/test_statevector_estimator.py
+++ b/test/python/primitives/test_statevector_estimator.py
@@ -281,6 +281,15 @@ def test_precision_seed(self):
         result = job.result()
         np.testing.assert_allclose(result[0].data.evs, [1.5555572817900956])
 
+    def test_iter_pub(self):
+        """An iterator of pubs should be accepted in place of a list."""
+        estimator = StatevectorEstimator()
+        bound = self.ansatz.assign_parameters([0, 1, 1, 2, 3, 5])
+        observable = self.observable.apply_layout(bound.layout)
+        result = estimator.run(iter([(bound, observable), (bound, observable)])).result()
+        for pub_result in (result[0], result[1]):
+            np.testing.assert_allclose(pub_result.data.evs, [-1.284366511861733])
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/test/python/primitives/test_statevector_sampler.py b/test/python/primitives/test_statevector_sampler.py
index cd0622b18de7..1a8ed0402e58 100644
--- a/test/python/primitives/test_statevector_sampler.py
+++ b/test/python/primitives/test_statevector_sampler.py
@@ -621,6 +621,22 @@ def test_no_cregs(self):
         self.assertEqual(len(result), 1)
         self.assertEqual(len(result[0].data), 0)
 
+    def test_iter_pub(self):
+        """An iterator of pubs should be accepted in place of a list."""
+        measure_only = QuantumCircuit(1)
+        measure_only.measure_all()
+        flip_then_measure = QuantumCircuit(1)
+        flip_then_measure.x(0)
+        flip_then_measure.measure_all()
+        pubs = iter([measure_only, flip_then_measure])
+        result = StatevectorSampler().run(pubs, shots=self._shots).result()
+        self.assertIsInstance(result, PrimitiveResult)
+        self.assertEqual(len(result), 2)
+        self.assertIsInstance(result[0], PubResult)
+        self.assertIsInstance(result[1], PubResult)
+        self._assert_allclose(result[0].data.meas, np.array({0: self._shots}))
+        self._assert_allclose(result[1].data.meas, np.array({1: self._shots}))
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/test/python/providers/basic_provider/test_standard_library.py b/test/python/providers/basic_provider/test_standard_library.py
new file mode 100644
index 000000000000..3d6b5c83ccc8
--- /dev/null
+++ b/test/python/providers/basic_provider/test_standard_library.py
@@ -0,0 +1,531 @@
+# This code is part of Qiskit.
+#
+# (C) Copyright IBM 2017, 2024.
+#
+# This code is licensed under the Apache License, Version 2.0. You may
+# obtain a copy of this license in the LICENSE.txt file in the root directory
+# of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+#
+# Any modifications or derivative works of this code must retain this
+# copyright notice, and modified files need to carry a notice indicating
+# that they have been altered from the originals.
+
+# pylint: disable=missing-function-docstring, missing-module-docstring
+
+import unittest
+
+from qiskit import QuantumCircuit
+from qiskit.providers.basic_provider import BasicSimulator
+import qiskit.circuit.library.standard_gates as lib
+from test import QiskitTestCase  # pylint: disable=wrong-import-order
+
+
+class TestStandardGates(QiskitTestCase):
+    """Smoke tests that BasicSimulator executes each standard gate, up to 3 qubits."""
+
+    def setUp(self):
+        super().setUp()
+        self.seed = 43  # fixed simulator seed so that runs are reproducible
+        self.shots = 1  # a single shot suffices: the tests only assert that the run succeeds
+        self.circuit = QuantumCircuit(4)
+
+    def test_barrier(self):
+        self.circuit.barrier(0, 1)
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_barrier_none(self):
+        self.circuit.barrier()
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_unitary(self):
+        matrix = [[0, 0, 0, 1], [0, 0, 1, 0], [1, 0, 0, 0], [0, 1, 0, 0]]
+        self.circuit.unitary(matrix, [0, 1])
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_u(self):
+        self.circuit.u(0.5, 1.5, 1.5, 0)
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_u1(self):
+        self.circuit.append(lib.U1Gate(0.5), [1])
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_u2(self):
+        self.circuit.append(lib.U2Gate(0.5, 0.5), [1])
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_u3(self):
+        self.circuit.append(lib.U3Gate(0.5, 0.5, 0.5), [1])
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_ccx(self):
+        self.circuit.ccx(0, 1, 2)
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_ccz(self):
+        self.circuit.ccz(0, 1, 2)
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_ch(self):
+        self.circuit.ch(0, 1)
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_cp(self):
+        self.circuit.cp(0, 0, 1)
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_crx(self):
+        self.circuit.crx(1, 0, 1)
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_cry(self):
+        self.circuit.cry(1, 0, 1)
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_crz(self):
+        self.circuit.crz(1, 0, 1)
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_cswap(self):
+        self.circuit.cswap(0, 1, 2)
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_cu1(self):
+        self.circuit.append(lib.CU1Gate(1), [1, 2])
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_cu3(self):
+        self.circuit.append(lib.CU3Gate(1, 2, 3), [1, 2])
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_cx(self):
+        self.circuit.cx(1, 2)
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_ecr(self):
+        self.circuit.ecr(1, 2)
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_cy(self):
+        self.circuit.cy(1, 2)
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_cz(self):
+        self.circuit.cz(1, 2)
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_h(self):
+        self.circuit.h(1)
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_id(self):
+        self.circuit.id(1)
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_rx(self):
+        self.circuit.rx(1, 1)
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_ry(self):
+        self.circuit.ry(1, 1)
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_rz(self):
+        self.circuit.rz(1, 1)
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_rxx(self):
+        self.circuit.rxx(1, 1, 0)
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_rzx(self):
+        self.circuit.rzx(1, 1, 0)
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_ryy(self):
+        self.circuit.ryy(1, 1, 0)
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_rzz(self):
+        self.circuit.rzz(1, 1, 2)
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_s(self):
+        self.circuit.s(1)
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_sdg(self):
+        self.circuit.sdg(1)
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_sx(self):
+        self.circuit.sx(1)
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_sxdg(self):
+        self.circuit.sxdg(1)
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_swap(self):
+        self.circuit.swap(1, 2)
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_iswap(self):
+        self.circuit.iswap(1, 0)
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_p(self):
+        self.circuit.p(1, 0)
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_r(self):
+        self.circuit.r(0.5, 0, 1)
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_t(self):
+        self.circuit.t(1)
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_tdg(self):
+        self.circuit.tdg(1)
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_x(self):
+        self.circuit.x(1)
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_y(self):
+        self.circuit.y(1)
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_z(self):
+        self.circuit.z(1)
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_cs(self):
+        self.circuit.cs(0, 1)
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_csdg(self):
+        self.circuit.csdg(0, 1)
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_csx(self):
+        self.circuit.csx(0, 1)
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_cu(self):
+        self.circuit.cu(0.5, 0.5, 0.5, 0.5, 0, 1)
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_dcx(self):
+        self.circuit.dcx(0, 1)
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_delay(self):
+        self.circuit.delay(0, 1)
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_reset(self):
+        self.circuit.reset(1)
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_rcx(self):
+        self.circuit.rccx(0, 1, 2)
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_global_phase(self):
+        # GlobalPhaseGate acts on no qubits, so it is appended with an empty qargs list.
+        self.circuit.append(lib.GlobalPhaseGate(0.1), [])
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_xx_minus_yy(self):
+        self.circuit.append(lib.XXMinusYYGate(0.1, 0.2), [0, 1])
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+    def test_xx_plus_yy(self):
+        self.circuit.append(lib.XXPlusYYGate(0.1, 0.2), [0, 1])
+        self.circuit.measure_all()
+        result = (
+            BasicSimulator().run(self.circuit, shots=self.shots, seed_simulator=self.seed).result()
+        )
+        self.assertTrue(result.success)
+
+
+class TestStandardGatesTarget(QiskitTestCase):
+    """The BasicSimulator target must advertise exactly the supported operation names."""
+
+    def test_target(self):
+        expected = {
+            "ccx",
+            "ccz",
+            "ch",
+            "cp",
+            "crx",
+            "cry",
+            "crz",
+            "cs",
+            "csdg",
+            "cswap",
+            "csx",
+            "cu",
+            "cu1",
+            "cu3",
+            "cx",
+            "cy",
+            "cz",
+            "dcx",
+            "delay",
+            "ecr",
+            "global_phase",
+            "h",
+            "id",
+            "iswap",
+            "measure",
+            "p",
+            "r",
+            "rccx",
+            "reset",
+            "rx",
+            "rxx",
+            "ry",
+            "ryy",
+            "rz",
+            "rzx",
+            "rzz",
+            "s",
+            "sdg",
+            "swap",
+            "sx",
+            "sxdg",
+            "t",
+            "tdg",
+            "u",
+            "u1",
+            "u2",
+            "u3",
+            "unitary",
+            "x",
+            "xx_minus_yy",
+            "xx_plus_yy",
+            "y",
+            "z",
+        }
+        reported = set(BasicSimulator().target.operation_names)
+        self.assertEqual(reported, expected)
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)
diff --git a/test/python/providers/test_backend_v2.py b/test/python/providers/test_backend_v2.py
index 6b974df902cd..70330085b1ab 100644
--- a/test/python/providers/test_backend_v2.py
+++ b/test/python/providers/test_backend_v2.py
@@ -19,6 +19,7 @@
 
 from ddt import ddt, data
 
+from numpy.testing import assert_array_max_ulp
 from qiskit.circuit import QuantumCircuit, ClassicalRegister, QuantumRegister
 from qiskit.circuit.library.standard_gates import (
     CXGate,
@@ -66,9 +67,11 @@ def assertMatchesTargetConstraints(self, tqc, target):
     def test_qubit_properties(self):
         """Test that qubit properties are returned as expected."""
         props = self.backend.qubit_properties([1, 0])
-        self.assertEqual([0.0001697368029059364, 0.00017739560485559633], [x.t1 for x in props])
-        self.assertEqual([0.00010941773478876496, 0.00014388784397520525], [x.t2 for x in props])
-        self.assertEqual([5487811175.818378, 5429298959.955691], [x.frequency for x in props])
+        assert_array_max_ulp([0.0001697368029059364, 0.00017739560485559633], [x.t1 for x in props])
+        assert_array_max_ulp(
+            [0.00010941773478876496, 0.00014388784397520525], [x.t2 for x in props]
+        )
+        assert_array_max_ulp([5487811175.818378, 5429298959.955691], [x.frequency for x in props])
 
     def test_legacy_qubit_properties(self):
         """Test that qubit props work for backends not using properties in target."""
@@ -82,9 +85,11 @@ def qubit_properties(self, qubit):
                 return [self.target.qubit_properties[i] for i in qubit]
 
         props = FakeBackendV2LegacyQubitProps(num_qubits=2, seed=42).qubit_properties([1, 0])
-        self.assertEqual([0.0001697368029059364, 0.00017739560485559633], [x.t1 for x in props])
-        self.assertEqual([0.00010941773478876496, 0.00014388784397520525], [x.t2 for x in props])
-        self.assertEqual([5487811175.818378, 5429298959.955691], [x.frequency for x in props])
+        assert_array_max_ulp([0.0001697368029059364, 0.00017739560485559633], [x.t1 for x in props])
+        assert_array_max_ulp(
+            [0.00010941773478876496, 0.00014388784397520525], [x.t2 for x in props]
+        )
+        assert_array_max_ulp([5487811175.818378, 5429298959.955691], [x.frequency for x in props])
 
     def test_no_qubit_properties_raises(self):
         """Ensure that if no qubit properties are defined we raise correctly."""
diff --git a/test/python/pulse/test_parameter_manager.py b/test/python/pulse/test_parameter_manager.py
index 54268af14577..0b91aaeaab4a 100644
--- a/test/python/pulse/test_parameter_manager.py
+++ b/test/python/pulse/test_parameter_manager.py
@@ -515,6 +515,44 @@ def test_parametric_pulses_with_parameter_vector(self):
         self.assertEqual(sched2.instructions[0][1].pulse.sigma, 4.0)
         self.assertEqual(sched2.instructions[1][1].phase, 0.1)
 
+    def test_pulse_assignment_with_parameter_names(self):
+        """Parameters and parameter vectors can be assigned by name rather than by object."""
+        sigma_param = Parameter("sigma")
+        amp_param = Parameter("amp")
+        phases = ParameterVector("param_vec", 2)
+
+        free_sigma = pulse.library.Gaussian(duration=128, sigma=sigma_param, amp=amp_param)
+        fixed_sigma = pulse.library.Gaussian(duration=128, sigma=40, amp=amp_param)
+        block = pulse.ScheduleBlock()
+        block += pulse.Play(free_sigma, pulse.DriveChannel(10))
+        block += pulse.Play(fixed_sigma, pulse.DriveChannel(10))
+        block += pulse.ShiftPhase(phases[0], pulse.DriveChannel(10))
+        block += pulse.ShiftPhase(phases[1], pulse.DriveChannel(10))
+        bound_block = block.assign_parameters(
+            {"amp": 0.2, "sigma": 4, "param_vec": [3.14, 1.57]}, inplace=False
+        )
+
+        self.assertEqual(bound_block.blocks[0].pulse.amp, 0.2)
+        self.assertEqual(bound_block.blocks[0].pulse.sigma, 4.0)
+        self.assertEqual(bound_block.blocks[1].pulse.amp, 0.2)
+        self.assertEqual(bound_block.blocks[2].phase, 3.14)
+        self.assertEqual(bound_block.blocks[3].phase, 1.57)
+
+        sched = pulse.Schedule()
+        sched += pulse.Play(free_sigma, pulse.DriveChannel(10))
+        sched += pulse.Play(fixed_sigma, pulse.DriveChannel(10))
+        sched += pulse.ShiftPhase(phases[0], pulse.DriveChannel(10))
+        sched += pulse.ShiftPhase(phases[1], pulse.DriveChannel(10))
+        bound_sched = sched.assign_parameters(
+            {"amp": 0.2, "sigma": 4, "param_vec": [3.14, 1.57]}, inplace=False
+        )
+
+        self.assertEqual(bound_sched.instructions[0][1].pulse.amp, 0.2)
+        self.assertEqual(bound_sched.instructions[0][1].pulse.sigma, 4.0)
+        self.assertEqual(bound_sched.instructions[1][1].pulse.amp, 0.2)
+        self.assertEqual(bound_sched.instructions[2][1].phase, 3.14)
+        self.assertEqual(bound_sched.instructions[3][1].phase, 1.57)
+
 
 class TestScheduleTimeslots(QiskitTestCase):
     """Test for edge cases of timing overlap on parametrized channels.
diff --git a/test/python/qasm3/test_export.py b/test/python/qasm3/test_export.py
index 3bb1667992a1..8589576441a2 100644
--- a/test/python/qasm3/test_export.py
+++ b/test/python/qasm3/test_export.py
@@ -24,7 +24,7 @@
 
 from qiskit import QuantumRegister, ClassicalRegister, QuantumCircuit, transpile
 from qiskit.circuit import Parameter, Qubit, Clbit, Instruction, Gate, Delay, Barrier
-from qiskit.circuit.classical import expr
+from qiskit.circuit.classical import expr, types
 from qiskit.circuit.controlflow import CASE_DEFAULT
 from qiskit.qasm3 import Exporter, dumps, dump, QASM3ExporterError, ExperimentalFeatures
 from qiskit.qasm3.exporter import QASM3Builder
@@ -948,7 +948,7 @@ def test_old_alias_classical_registers_option(self):
 
     def test_simple_for_loop(self):
         """Test that a simple for loop outputs the expected result."""
-        parameter = Parameter("x")
+        parameter = Parameter("my_x")
         loop_body = QuantumCircuit(1)
         loop_body.rx(parameter, 0)
         loop_body.break_loop()
@@ -978,8 +978,8 @@ def test_simple_for_loop(self):
 
     def test_nested_for_loop(self):
         """Test that a for loop nested inside another outputs the expected result."""
-        inner_parameter = Parameter("x")
-        outer_parameter = Parameter("y")
+        inner_parameter = Parameter("my_x")
+        outer_parameter = Parameter("my_y")
 
         inner_body = QuantumCircuit(2)
         inner_body.rz(inner_parameter, 0)
@@ -1024,9 +1024,9 @@ def test_nested_for_loop(self):
     def test_regular_parameter_in_nested_for_loop(self):
         """Test that a for loop nested inside another outputs the expected result, including
         defining parameters that are used in nested loop scopes."""
-        inner_parameter = Parameter("x")
-        outer_parameter = Parameter("y")
-        regular_parameter = Parameter("t")
+        inner_parameter = Parameter("my_x")
+        outer_parameter = Parameter("my_y")
+        regular_parameter = Parameter("my_t")
 
         inner_body = QuantumCircuit(2)
         inner_body.h(0)
@@ -1471,6 +1471,17 @@ def test_parameters_and_registers_cannot_have_naming_clashes(self):
         self.assertIn("clash", parameter_name["name"])
         self.assertNotEqual(register_name["name"], parameter_name["name"])
 
+    def test_parameters_and_gates_cannot_have_naming_clashes(self):
+        """A parameter that shares its name with a gate must be exported under a new name."""
+        circuit = QuantumCircuit(QuantumRegister(1, "q"))
+        circuit.rz(Parameter("rz"), 0)
+
+        # The exported name should still contain the original one, but must not equal it.
+        match = self.scalar_parameter_regex.search(dumps(circuit))
+        self.assertTrue(match)
+        self.assertIn("rz", match["name"])
+        self.assertNotEqual(match["name"], "rz")
+
     # Not necessarily all the reserved keywords, just a sensibly-sized subset.
     @data("bit", "const", "def", "defcal", "float", "gate", "include", "int", "let", "measure")
     def test_reserved_keywords_as_names_are_escaped(self, keyword):
@@ -1736,6 +1747,145 @@ def test_no_unnecessary_cast(self):
 bit[8] cr;
 if (cr == 1) {
 }
+"""
+        self.assertEqual(dumps(qc), expected)
+
+    def test_var_use(self):
+        """Input and declared vars can be read and written at the top level of a circuit."""
+        circuit = QuantumCircuit()
+        flag = circuit.add_input("a", types.Bool())
+        byte = circuit.add_input("b", types.Uint(8))
+        circuit.store(flag, expr.logic_not(flag))
+        circuit.store(byte, expr.bit_and(byte, 8))
+        circuit.add_var("c", expr.bit_not(byte))
+        # Added after `c` on purpose: the export must still emit every input before it.
+        circuit.add_input("d", types.Bool())
+
+        program = """\
+OPENQASM 3.0;
+include "stdgates.inc";
+input bool a;
+input uint[8] b;
+input bool d;
+uint[8] c;
+a = !a;
+b = b & 8;
+c = ~b;
+"""
+        self.assertEqual(dumps(circuit), program)
+
+    def test_var_use_in_scopes(self):
+        """`Var` nodes can be used, stored to and shadowed inside control-flow scopes."""
+        circuit = QuantumCircuit(2, 2)
+        cond = circuit.add_input("a", types.Bool())
+        outer_b = circuit.add_var("b", expr.lift(5, types.Uint(16)))
+        with circuit.if_test(expr.logic_not(cond)) as otherwise:
+            circuit.store(outer_b, expr.bit_not(outer_b))
+            circuit.h(0)
+        with otherwise:
+            # A new inner `b` with the same type as the outer one.
+            circuit.add_var("b", expr.lift(7, outer_b.type))
+        with circuit.while_loop(cond):
+            # A new inner `b` whose type differs from the outer one.
+            circuit.add_var("b", cond)
+        with circuit.switch(outer_b) as case_:
+            with case_(0):
+                circuit.store(outer_b, expr.lift(3, outer_b.type))
+            with case_(case_.DEFAULT):
+                circuit.add_var("b", expr.logic_not(cond))
+                circuit.cx(0, 1)
+        circuit.measure([0, 1], [0, 1])
+        program = """\
+OPENQASM 3.0;
+include "stdgates.inc";
+input bool a;
+bit[2] c;
+int switch_dummy;
+qubit[2] q;
+uint[16] b;
+b = 5;
+if (!a) {
+  b = ~b;
+  h q[0];
+} else {
+  uint[16] b;
+  b = 7;
+}
+while (a) {
+  bool b;
+  b = a;
+}
+switch_dummy = b;
+switch (switch_dummy) {
+  case 0 {
+    b = 3;
+  }
+  default {
+    bool b;
+    b = !a;
+    cx q[0], q[1];
+  }
+}
+c[0] = measure q[0];
+c[1] = measure q[1];
+"""
+        self.assertEqual(dumps(circuit), program)
+
+    def test_var_naming_clash_parameter(self):
+        """A `Var` and a `Parameter` that share a name must both be exported; the expected
+        program keeps the parameter's name and gives the `Var` a generated one."""
+        circuit = QuantumCircuit(1)
+        circuit.add_var("a", False)
+        circuit.rx(Parameter("a"), 0)
+        program = """\
+OPENQASM 3.0;
+include "stdgates.inc";
+input float[64] a;
+qubit[1] q;
+bool a__generated0;
+a__generated0 = false;
+rx(a) q[0];
+"""
+        self.assertEqual(dumps(circuit), program)
+
+    def test_var_naming_clash_register(self):
+        """A `Var` that shares a name with a register must still be exported; the expected
+        program keeps the register names and gives each `Var` a generated one."""
+        circuit = QuantumCircuit(QuantumRegister(2, "q"), ClassicalRegister(2, "c"))
+        circuit.add_input("c", types.Bool())
+        circuit.add_var("q", False)
+        program = """\
+OPENQASM 3.0;
+include "stdgates.inc";
+input bool c__generated0;
+bit[2] c;
+qubit[2] q;
+bool q__generated1;
+q__generated1 = false;
+"""
+        self.assertEqual(dumps(circuit), program)
+
+    def test_var_naming_clash_gate(self):
+        """We should support a `Var` clashing in name with some gate if `QuantumCircuit` allows
+        it."""
+        qc = QuantumCircuit(2)
+        qc.add_input("cx", types.Bool())
+        qc.add_input("U", types.Bool())
+        qc.add_var("rx", expr.lift(5, types.Uint(8)))
+
+        qc.cx(0, 1)
+        qc.u(0.5, 0.125, 0.25, 0)
+        # We don't actually use `rx`, but it's still in the `stdgates` include.
+        expected = """\
+OPENQASM 3.0;
+include "stdgates.inc";
+input bool cx__generated0;
+input bool U__generated1;
+qubit[2] q;
+uint[8] rx__generated2;
+rx__generated2 = 5;
+cx q[0], q[1];
+U(0.5, 0.125, 0.25) q[0];
 """
         self.assertEqual(dumps(qc), expected)
 
@@ -2654,3 +2804,11 @@ def test_disallow_opaque_instruction(self):
             QASM3ExporterError, "Exporting opaque instructions .* is not yet supported"
         ):
             exporter.dumps(qc)
+
+    def test_disallow_export_of_inner_scope(self):
+        """Exporting a circuit that has captured variables as a program must be rejected."""
+        captured = expr.Var.new("a", types.Bool())
+        inner_scope = QuantumCircuit(captures=[captured])
+        message = "cannot export an inner scope.*as a top-level program"
+        with self.assertRaisesRegex(QASM3ExporterError, message):
+            dumps(inner_scope)
diff --git a/test/python/quantum_info/operators/symplectic/test_sparse_pauli_op.py b/test/python/quantum_info/operators/symplectic/test_sparse_pauli_op.py
index fdbfb4d4201d..330fd53bc35d 100644
--- a/test/python/quantum_info/operators/symplectic/test_sparse_pauli_op.py
+++ b/test/python/quantum_info/operators/symplectic/test_sparse_pauli_op.py
@@ -15,9 +15,11 @@
 import itertools as it
 import unittest
 import numpy as np
+import scipy.sparse
 import rustworkx as rx
 from ddt import ddt
 
+
 from qiskit import QiskitError
 from qiskit.circuit import ParameterExpression, Parameter, ParameterVector
 from qiskit.circuit.parametertable import ParameterView
@@ -259,6 +261,36 @@ def test_to_matrix_large(self):
         np.testing.assert_array_equal(spp_op.to_matrix(), target)
         np.testing.assert_array_equal(spp_op.to_matrix(sparse=True).toarray(), target)
 
+    def test_to_matrix_zero(self):
+        """An operator with no terms must convert to the all-zero matrix, dense or sparse."""
+        num_qubits = 4
+        zero_numpy = np.zeros((2**num_qubits, 2**num_qubits), dtype=np.complex128)
+        zero = SparsePauliOp.from_list([], num_qubits=num_qubits)
+
+        zero_dense = zero.to_matrix(sparse=False)
+        np.testing.assert_array_equal(zero_dense, zero_numpy)
+        # Densify with `toarray()` below: the `.A` shorthand is deprecated in newer SciPy.
+        zero_sparse = zero.to_matrix(sparse=True)
+        self.assertIsInstance(zero_sparse, scipy.sparse.csr_matrix)
+        np.testing.assert_array_equal(zero_sparse.toarray(), zero_numpy)
+
+    def test_to_matrix_parallel_vs_serial(self):
+        """The parallel and serial code paths of `to_matrix` must give the same matrix, for both
+        the sparse and the dense output formats."""
+        # Using powers-of-two coefficients to make floating-point arithmetic associative so we can
+        # do bit-for-bit assertions.  The labels overlap in at least a few locations.
+        operator = SparsePauliOp(
+            ["XZIXYX", "YIIYXY", "ZZZIIZ", "IIIIII"],
+            [0.25, 0.125j, 0.5 - 0.25j, -0.125 + 0.5j],
+        )
+        parallel_sparse = operator.to_matrix(sparse=True, force_serial=False).toarray()
+        serial_sparse = operator.to_matrix(sparse=True, force_serial=True).toarray()
+        np.testing.assert_array_equal(parallel_sparse, serial_sparse)
+
+        parallel_dense = operator.to_matrix(sparse=False, force_serial=False)
+        serial_dense = operator.to_matrix(sparse=False, force_serial=True)
+        np.testing.assert_array_equal(parallel_dense, serial_dense)
+
     def test_to_matrix_parameters(self):
         """Test to_matrix method for parameterized SparsePauliOp."""
         labels = ["XI", "YZ", "YY", "ZZ"]
diff --git a/test/python/quantum_info/operators/test_operator.py b/test/python/quantum_info/operators/test_operator.py
index fc824643a0ba..d653d6182017 100644
--- a/test/python/quantum_info/operators/test_operator.py
+++ b/test/python/quantum_info/operators/test_operator.py
@@ -17,6 +17,7 @@
 import unittest
 import logging
 import copy
+
 from test import combine
 import numpy as np
 from ddt import ddt
@@ -26,6 +27,7 @@
 from qiskit import QiskitError
 from qiskit import QuantumRegister, ClassicalRegister, QuantumCircuit
 from qiskit.circuit.library import HGate, CHGate, CXGate, QFT
+from qiskit.transpiler import CouplingMap
 from qiskit.transpiler.layout import Layout, TranspileLayout
 from qiskit.quantum_info.operators import Operator, ScalarOp
 from qiskit.quantum_info.operators.predicates import matrix_equal
@@ -735,6 +737,28 @@ def test_from_circuit_constructor_no_layout(self):
         global_phase_equivalent = matrix_equal(op.data, target, ignore_phase=True)
         self.assertTrue(global_phase_equivalent)
 
+    def test_from_circuit_initial_layout_final_layout(self):
+        """Test initialization from a transpiled circuit that carries both a non-trivial
+        initial_layout and the final_layout produced by transpilation."""
+        qc = QuantumCircuit(5)
+        qc.h(0)
+        qc.cx(2, 1)
+        qc.cx(1, 2)
+        qc.cx(1, 0)
+        qc.cx(1, 3)
+        qc.cx(1, 4)
+        qc.h(2)
+
+        qc_transpiled = transpile(
+            qc,
+            coupling_map=CouplingMap.from_line(5),
+            initial_layout=[2, 3, 4, 0, 1],
+            optimization_level=1,
+            seed_transpiler=17,
+        )
+        # `from_circuit` on the transpiled circuit must be `equiv` to the original circuit.
+        self.assertTrue(Operator.from_circuit(qc_transpiled).equiv(qc))
+
     def test_from_circuit_constructor_reverse_embedded_layout(self):
         """Test initialization from a circuit with an embedded reverse layout."""
         # Test tensor product of 1-qubit gates
@@ -817,7 +841,7 @@ def test_from_circuit_constructor_reverse_embedded_layout_and_final_layout(self)
         circuit._layout = TranspileLayout(
             Layout({circuit.qubits[2]: 0, circuit.qubits[1]: 1, circuit.qubits[0]: 2}),
             {qubit: index for index, qubit in enumerate(circuit.qubits)},
-            Layout({circuit.qubits[0]: 1, circuit.qubits[1]: 2, circuit.qubits[2]: 0}),
+            Layout({circuit.qubits[0]: 2, circuit.qubits[1]: 0, circuit.qubits[2]: 1}),
         )
         circuit.swap(0, 1)
         circuit.swap(1, 2)
@@ -839,7 +863,7 @@ def test_from_circuit_constructor_reverse_embedded_layout_and_manual_final_layou
             Layout({circuit.qubits[2]: 0, circuit.qubits[1]: 1, circuit.qubits[0]: 2}),
             {qubit: index for index, qubit in enumerate(circuit.qubits)},
         )
-        final_layout = Layout({circuit.qubits[0]: 1, circuit.qubits[1]: 2, circuit.qubits[2]: 0})
+        final_layout = Layout({circuit.qubits[0]: 2, circuit.qubits[1]: 0, circuit.qubits[2]: 1})
         circuit.swap(0, 1)
         circuit.swap(1, 2)
         op = Operator.from_circuit(circuit, final_layout=final_layout)
@@ -966,7 +990,7 @@ def test_from_circuit_constructor_empty_layout(self):
         circuit.h(0)
         circuit.cx(0, 1)
         layout = Layout()
-        with self.assertRaises(IndexError):
+        with self.assertRaises(KeyError):
             Operator.from_circuit(circuit, layout=layout)
 
     def test_compose_scalar(self):
@@ -1078,6 +1102,27 @@ def test_from_circuit_mixed_reg_loose_bits_transpiled(self):
         result = Operator.from_circuit(tqc)
         self.assertTrue(Operator(circuit).equiv(result))
 
+    def test_from_circuit_into_larger_map(self):
+        """Test from_circuit when there are more physical qubits than original virtual qubits."""
+        # original circuit on 3 qubits
+        qc = QuantumCircuit(3)
+        qc.h(0)
+        qc.cx(0, 1)
+        qc.cx(1, 2)
+
+        # transpile into 5-qubits; the seed is pinned so the routing is reproducible
+        tqc = transpile(
+            qc, coupling_map=CouplingMap.from_line(5), initial_layout=[0, 2, 4], seed_transpiler=17
+        )
+
+        # qc expanded with ancilla qubits
+        expected = QuantumCircuit(5)
+        expected.h(0)
+        expected.cx(0, 1)
+        expected.cx(1, 2)
+
+        self.assertEqual(Operator.from_circuit(tqc), Operator(expected))
+
     def test_apply_permutation_back(self):
         """Test applying permutation to the operator,
         where the operator is applied first and the permutation second."""
diff --git a/test/python/quantum_info/states/test_utils.py b/test/python/quantum_info/states/test_utils.py
index 9a9015944e70..1382963ed554 100644
--- a/test/python/quantum_info/states/test_utils.py
+++ b/test/python/quantum_info/states/test_utils.py
@@ -113,14 +113,14 @@ def test_schmidt_decomposition_3_level_system(self):
 
         # check decomposition elements
         self.assertAlmostEqual(schmidt_comps[0][0], 1 / np.sqrt(3))
-        self.assertEqual(schmidt_comps[0][1], Statevector(np.array([1, 0, 0]), dims=(3)))
-        self.assertEqual(schmidt_comps[0][2], Statevector(np.array([1, 0, 0]), dims=(3)))
+        self.assertEqual(schmidt_comps[0][1], Statevector(np.array([1, 0, 0]), dims=3))
+        self.assertEqual(schmidt_comps[0][2], Statevector(np.array([1, 0, 0]), dims=3))
         self.assertAlmostEqual(schmidt_comps[1][0], 1 / np.sqrt(3))
-        self.assertEqual(schmidt_comps[1][1], Statevector(np.array([0, 1, 0]), dims=(3)))
-        self.assertEqual(schmidt_comps[1][2], Statevector(np.array([0, 1, 0]), dims=(3)))
+        self.assertEqual(schmidt_comps[1][1], Statevector(np.array([0, 1, 0]), dims=3))
+        self.assertEqual(schmidt_comps[1][2], Statevector(np.array([0, 1, 0]), dims=3))
         self.assertAlmostEqual(schmidt_comps[2][0], 1 / np.sqrt(3))
-        self.assertEqual(schmidt_comps[2][1], Statevector(np.array([0, 0, 1]), dims=(3)))
-        self.assertEqual(schmidt_comps[2][2], Statevector(np.array([0, 0, 1]), dims=(3)))
+        self.assertEqual(schmidt_comps[2][1], Statevector(np.array([0, 0, 1]), dims=3))
+        self.assertEqual(schmidt_comps[2][2], Statevector(np.array([0, 0, 1]), dims=3))
 
         # check that state can be properly reconstructed
         state = Statevector(
diff --git a/test/python/result/test_result.py b/test/python/result/test_result.py
index 1bbecb6b65e8..7d73ab2ebcf6 100644
--- a/test/python/result/test_result.py
+++ b/test/python/result/test_result.py
@@ -56,7 +56,7 @@ def test_counts_no_header(self):
         no_header_processed_counts = {
             bin(int(bs[2:], 16))[2:]: counts for (bs, counts) in raw_counts.items()
         }
-        data = models.ExperimentResultData(counts=dict(**raw_counts))
+        data = models.ExperimentResultData(counts=raw_counts)
         exp_result = models.ExperimentResult(shots=14, success=True, meas_level=2, data=data)
         result = Result(results=[exp_result], **self.base_result_args)
 
@@ -66,7 +66,7 @@ def test_counts_header(self):
         """Test that counts are extracted properly with header."""
         raw_counts = {"0x0": 4, "0x2": 10}
         processed_counts = {"0 0 00": 4, "0 0 10": 10}
-        data = models.ExperimentResultData(counts=dict(**raw_counts))
+        data = models.ExperimentResultData(counts=raw_counts)
         exp_result_header = QobjExperimentHeader(
             creg_sizes=[["c0", 2], ["c0", 1], ["c1", 1]], memory_slots=4
         )
@@ -81,7 +81,7 @@ def test_counts_by_name(self):
         """Test that counts are extracted properly by name."""
         raw_counts = {"0x0": 4, "0x2": 10}
         processed_counts = {"0 0 00": 4, "0 0 10": 10}
-        data = models.ExperimentResultData(counts=dict(**raw_counts))
+        data = models.ExperimentResultData(counts=raw_counts)
         exp_result_header = QobjExperimentHeader(
             creg_sizes=[["c0", 2], ["c0", 1], ["c1", 1]], memory_slots=4, name="a_name"
         )
@@ -107,7 +107,7 @@ def test_counts_duplicate_name(self):
     def test_result_repr(self):
         """Test that repr is contstructed correctly for a results object."""
         raw_counts = {"0x0": 4, "0x2": 10}
-        data = models.ExperimentResultData(counts=dict(**raw_counts))
+        data = models.ExperimentResultData(counts=raw_counts)
         exp_result_header = QobjExperimentHeader(
             creg_sizes=[["c0", 2], ["c0", 1], ["c1", 1]], memory_slots=4
         )
@@ -136,7 +136,7 @@ def test_multiple_circuits_counts(self):
         """
         raw_counts_1 = {"0x0": 5, "0x3": 12, "0x5": 9, "0xD": 6, "0xE": 2}
         processed_counts_1 = {"0000": 5, "0011": 12, "0101": 9, "1101": 6, "1110": 2}
-        data_1 = models.ExperimentResultData(counts=dict(**raw_counts_1))
+        data_1 = models.ExperimentResultData(counts=raw_counts_1)
         exp_result_header_1 = QobjExperimentHeader(creg_sizes=[["c0", 4]], memory_slots=4)
         exp_result_1 = models.ExperimentResult(
             shots=14, success=True, meas_level=2, data=data_1, header=exp_result_header_1
@@ -144,7 +144,7 @@ def test_multiple_circuits_counts(self):
 
         raw_counts_2 = {"0x1": 0, "0x4": 3, "0x6": 6, "0xA": 1, "0xB": 2}
         processed_counts_2 = {"0001": 0, "0100": 3, "0110": 6, "1010": 1, "1011": 2}
-        data_2 = models.ExperimentResultData(counts=dict(**raw_counts_2))
+        data_2 = models.ExperimentResultData(counts=raw_counts_2)
         exp_result_header_2 = QobjExperimentHeader(creg_sizes=[["c0", 4]], memory_slots=4)
         exp_result_2 = models.ExperimentResult(
             shots=14, success=True, meas_level=2, data=data_2, header=exp_result_header_2
@@ -152,7 +152,7 @@ def test_multiple_circuits_counts(self):
 
         raw_counts_3 = {"0xC": 27, "0xF": 20}
         processed_counts_3 = {"1100": 27, "1111": 20}
-        data_3 = models.ExperimentResultData(counts=dict(**raw_counts_3))
+        data_3 = models.ExperimentResultData(counts=raw_counts_3)
         exp_result_header_3 = QobjExperimentHeader(creg_sizes=[["c0", 4]], memory_slots=4)
         exp_result_3 = models.ExperimentResult(
             shots=14, success=True, meas_level=2, data=data_3, header=exp_result_header_3
@@ -171,7 +171,7 @@ def test_multiple_circuits_counts(self):
     def test_marginal_counts(self):
         """Test that counts are marginalized correctly."""
         raw_counts = {"0x0": 4, "0x1": 7, "0x2": 10, "0x6": 5, "0x9": 11, "0xD": 9, "0xE": 8}
-        data = models.ExperimentResultData(counts=dict(**raw_counts))
+        data = models.ExperimentResultData(counts=raw_counts)
         exp_result_header = QobjExperimentHeader(creg_sizes=[["c0", 4]], memory_slots=4)
         exp_result = models.ExperimentResult(
             shots=54, success=True, data=data, header=exp_result_header
@@ -322,7 +322,7 @@ def test_marginal_counts_result_inplace(self):
     def test_marginal_counts_result_creg_sizes(self):
         """Test that marginal_counts with Result input properly changes creg_sizes."""
         raw_counts = {"0x0": 4, "0x1": 7, "0x2": 10, "0x6": 5, "0x9": 11, "0xD": 9, "0xE": 8}
-        data = models.ExperimentResultData(counts=dict(**raw_counts))
+        data = models.ExperimentResultData(counts=raw_counts)
         exp_result_header = QobjExperimentHeader(creg_sizes=[["c0", 1], ["c1", 3]], memory_slots=4)
         exp_result = models.ExperimentResult(
             shots=54, success=True, data=data, header=exp_result_header
@@ -343,7 +343,7 @@ def test_marginal_counts_result_creg_sizes(self):
     def test_marginal_counts_result_format(self):
         """Test that marginal_counts with format_marginal true properly formats output."""
         raw_counts_1 = {"0x0": 4, "0x1": 7, "0x2": 10, "0x6": 5, "0x9": 11, "0xD": 9, "0x12": 8}
-        data_1 = models.ExperimentResultData(counts=dict(**raw_counts_1))
+        data_1 = models.ExperimentResultData(counts=raw_counts_1)
         exp_result_header_1 = QobjExperimentHeader(
             creg_sizes=[["c0", 2], ["c1", 3]], memory_slots=5
         )
@@ -368,14 +368,14 @@ def test_marginal_counts_result_format(self):
     def test_marginal_counts_inplace_true(self):
         """Test marginal_counts(Result, inplace = True)"""
         raw_counts_1 = {"0x0": 4, "0x1": 7, "0x2": 10, "0x6": 5, "0x9": 11, "0xD": 9, "0xE": 8}
-        data_1 = models.ExperimentResultData(counts=dict(**raw_counts_1))
+        data_1 = models.ExperimentResultData(counts=raw_counts_1)
         exp_result_header_1 = QobjExperimentHeader(creg_sizes=[["c0", 4]], memory_slots=4)
         exp_result_1 = models.ExperimentResult(
             shots=54, success=True, data=data_1, header=exp_result_header_1
         )
 
         raw_counts_2 = {"0x2": 5, "0x3": 8}
-        data_2 = models.ExperimentResultData(counts=dict(**raw_counts_2))
+        data_2 = models.ExperimentResultData(counts=raw_counts_2)
         exp_result_header_2 = QobjExperimentHeader(creg_sizes=[["c0", 2]], memory_slots=2)
         exp_result_2 = models.ExperimentResult(
             shots=13, success=True, data=data_2, header=exp_result_header_2
@@ -393,14 +393,14 @@ def test_marginal_counts_inplace_true(self):
     def test_marginal_counts_inplace_false(self):
         """Test marginal_counts(Result, inplace=False)"""
         raw_counts_1 = {"0x0": 4, "0x1": 7, "0x2": 10, "0x6": 5, "0x9": 11, "0xD": 9, "0xE": 8}
-        data_1 = models.ExperimentResultData(counts=dict(**raw_counts_1))
+        data_1 = models.ExperimentResultData(counts=raw_counts_1)
         exp_result_header_1 = QobjExperimentHeader(creg_sizes=[["c0", 4]], memory_slots=4)
         exp_result_1 = models.ExperimentResult(
             shots=54, success=True, data=data_1, header=exp_result_header_1
         )
 
         raw_counts_2 = {"0x2": 5, "0x3": 8}
-        data_2 = models.ExperimentResultData(counts=dict(**raw_counts_2))
+        data_2 = models.ExperimentResultData(counts=raw_counts_2)
         exp_result_header_2 = QobjExperimentHeader(creg_sizes=[["c0", 2]], memory_slots=2)
         exp_result_2 = models.ExperimentResult(
             shots=13, success=True, data=data_2, header=exp_result_header_2
@@ -689,7 +689,7 @@ def setUp(self):
     def test_counts_int_out(self):
         """Test that fails when get_count is called with a nonexistent int."""
         raw_counts = {"0x0": 4, "0x2": 10}
-        data = models.ExperimentResultData(counts=dict(**raw_counts))
+        data = models.ExperimentResultData(counts=raw_counts)
         exp_result = models.ExperimentResult(shots=14, success=True, meas_level=2, data=data)
         result = Result(results=[exp_result], **self.base_result_args)
 
@@ -702,7 +702,7 @@ def test_counts_int_out(self):
     def test_counts_name_out(self):
         """Test that fails when get_count is called with a nonexistent name."""
         raw_counts = {"0x0": 4, "0x2": 10}
-        data = models.ExperimentResultData(counts=dict(**raw_counts))
+        data = models.ExperimentResultData(counts=raw_counts)
         exp_result_header = QobjExperimentHeader(
             creg_sizes=[["c0", 2], ["c0", 1], ["c1", 1]], memory_slots=4, name="a_name"
         )
@@ -735,7 +735,7 @@ def test_memory_int_out(self):
     def test_marginal_counts_no_cregs(self):
         """Test that marginal_counts without cregs See qiskit-terra/6430."""
         raw_counts_1 = {"0x0": 4, "0x1": 7, "0x2": 10, "0x6": 5, "0x9": 11, "0xD": 9, "0x12": 8}
-        data_1 = models.ExperimentResultData(counts=dict(**raw_counts_1))
+        data_1 = models.ExperimentResultData(counts=raw_counts_1)
         exp_result_header_1 = QobjExperimentHeader(memory_slots=5)
         exp_result_1 = models.ExperimentResult(
             shots=54, success=True, data=data_1, header=exp_result_header_1
diff --git a/test/python/synthesis/test_permutation_synthesis.py b/test/python/synthesis/test_permutation_synthesis.py
index 7fc6f5e24ab8..5c4317ed58a3 100644
--- a/test/python/synthesis/test_permutation_synthesis.py
+++ b/test/python/synthesis/test_permutation_synthesis.py
@@ -19,8 +19,12 @@
 
 from qiskit.quantum_info.operators import Operator
 from qiskit.circuit.library import LinearFunction, PermutationGate
-from qiskit.synthesis import synth_permutation_acg
-from qiskit.synthesis.permutation import synth_permutation_depth_lnn_kms, synth_permutation_basic
+from qiskit.synthesis.permutation import (
+    synth_permutation_acg,
+    synth_permutation_depth_lnn_kms,
+    synth_permutation_basic,
+    synth_permutation_reverse_lnn_kms,
+)
 from qiskit.synthesis.permutation.permutation_utils import _get_ordered_swap
 from test import QiskitTestCase  # pylint: disable=wrong-import-order
 
@@ -108,6 +112,26 @@ def test_synth_permutation_depth_lnn_kms(self, width):
             synthesized_pattern = LinearFunction(qc).permutation_pattern()
             self.assertTrue(np.array_equal(synthesized_pattern, pattern))
 
+    @data(1, 2, 3, 4, 5, 10, 15, 20)
+    def test_synth_permutation_reverse_lnn_kms(self, num_qubits):
+        """Test synth_permutation_reverse_lnn_kms function produces the correct
+        circuit."""
+        pattern = list(reversed(range(num_qubits)))
+        qc = synth_permutation_reverse_lnn_kms(num_qubits)
+        self.assertListEqual((LinearFunction(qc).permutation_pattern()).tolist(), pattern)
+
+        # Check that the CX depth of the circuit is at most 2*n+2
+        self.assertLessEqual(qc.depth(), 2 * num_qubits + 2)
+
+        # Check that the synthesized circuit consists of CX gates only,
+        # and that these CXs adhere to the LNN connectivity.
+        for instruction in qc.data:
+            self.assertEqual(instruction.operation.name, "cx")
+            q0 = qc.find_bit(instruction.qubits[0]).index
+            q1 = qc.find_bit(instruction.qubits[1]).index
+            dist = abs(q0 - q1)
+            self.assertEqual(dist, 1)
+
     @data(4, 5, 6, 7)
     def test_permutation_matrix(self, width):
         """Test that the unitary matrix constructed from permutation pattern
diff --git a/test/python/synthesis/test_synthesis.py b/test/python/synthesis/test_synthesis.py
index d3f8560cb9b2..cb918b29146a 100644
--- a/test/python/synthesis/test_synthesis.py
+++ b/test/python/synthesis/test_synthesis.py
@@ -23,6 +23,7 @@
 from ddt import ddt, data
 
 from qiskit import QiskitError, transpile
+from qiskit.dagcircuit.dagcircuit import DAGCircuit
 from qiskit.circuit import QuantumCircuit, QuantumRegister
 from qiskit.converters import dag_to_circuit, circuit_to_dag
 from qiskit.circuit.library import (
@@ -270,6 +271,8 @@ def check_exact_decomposition(
     ):
         """Check exact decomposition for a particular target"""
         decomp_circuit = decomposer(target_unitary, _num_basis_uses=num_basis_uses)
+        if isinstance(decomp_circuit, DAGCircuit):
+            decomp_circuit = dag_to_circuit(decomp_circuit)
         if num_basis_uses is not None:
             self.assertEqual(num_basis_uses, decomp_circuit.count_ops().get("unitary", 0))
         decomp_unitary = Operator(decomp_circuit).data
@@ -1232,6 +1235,42 @@ def test_euler_basis_selection(self, euler_bases, kak_gates, seed):
             requested_basis = set(oneq_gates + [kak_gate_name])
             self.assertTrue(decomposition_basis.issubset(requested_basis))
 
+    @combine(
+        seed=range(10),
+        euler_bases=[
+            ("U321", ["u3", "u2", "u1"]),
+            ("U3", ["u3"]),
+            ("U", ["u"]),
+            ("U1X", ["u1", "rx"]),
+            ("RR", ["r"]),
+            ("PSX", ["p", "sx"]),
+            ("ZYZ", ["rz", "ry"]),
+            ("ZXZ", ["rz", "rx"]),
+            ("XYX", ["rx", "ry"]),
+            ("ZSX", ["rz", "sx"]),
+            ("ZSXX", ["rz", "sx", "x"]),
+        ],
+        kak_gates=[
+            (CXGate(), "cx"),
+            (CZGate(), "cz"),
+            (iSwapGate(), "iswap"),
+            (RXXGate(np.pi / 2), "rxx"),
+        ],
+        name="test_use_dag_{seed}_{euler_bases[0]}_{kak_gates[1]}",
+    )
+    def test_use_dag(self, euler_bases, kak_gates, seed):
+        """Test the use_dag flag returns a correct dagcircuit with various target bases."""
+        (euler_basis, oneq_gates) = euler_bases
+        (kak_gate, kak_gate_name) = kak_gates
+        with self.subTest(euler_basis=euler_basis, kak_gate=kak_gate):
+            decomposer = TwoQubitBasisDecomposer(kak_gate, euler_basis=euler_basis)
+            unitary = random_unitary(4, seed=seed)
+            self.assertIsInstance(decomposer(unitary, use_dag=True), DAGCircuit)
+            self.check_exact_decomposition(unitary.data, decomposer)
+            decomposition_basis = set(decomposer(unitary).count_ops())
+            requested_basis = set(oneq_gates + [kak_gate_name])
+            self.assertTrue(decomposition_basis.issubset(requested_basis))
+
 
 @ddt
 class TestPulseOptimalDecompose(CheckDecompositions):
diff --git a/test/python/transpiler/test_1q.py b/test/python/transpiler/test_1q.py
index 31975456f346..50bdc7b24643 100644
--- a/test/python/transpiler/test_1q.py
+++ b/test/python/transpiler/test_1q.py
@@ -1,6 +1,6 @@
 # This code is part of Qiskit.
 #
-# (C) Copyright IBM 2019.
+# (C) Copyright IBM 2019, 2024.
 #
 # This code is licensed under the Apache License, Version 2.0. You may
 # obtain a copy of this license in the LICENSE.txt file in the root directory
@@ -17,6 +17,7 @@
 from qiskit import QuantumCircuit
 from qiskit.compiler import transpile
 from qiskit.providers.fake_provider import Fake1Q
+from qiskit.providers.basic_provider import BasicSimulator
 from qiskit.transpiler import TranspilerError
 from test import combine  # pylint: disable=wrong-import-order
 from test import QiskitTestCase  # pylint: disable=wrong-import-order
@@ -76,9 +77,7 @@ def test_device(self, circuit, level):
         name="{circuit.__name__}_level{level}_valid",
     )
     def test_simulator(self, circuit, level):
-        """All the levels with all the 1Q simulator backend"""
-        # Set fake backend config to simulator
-        backend = Fake1Q()
-        backend._configuration.simulator = True
+        """All the levels with a simulator backend"""
+        backend = BasicSimulator()
         result = transpile(circuit(), backend=backend, optimization_level=level, seed_transpiler=42)
         self.assertIsInstance(result, QuantumCircuit)
diff --git a/test/python/transpiler/test_dynamical_decoupling.py b/test/python/transpiler/test_dynamical_decoupling.py
index 6460847b2e81..5f11ede3a3e8 100644
--- a/test/python/transpiler/test_dynamical_decoupling.py
+++ b/test/python/transpiler/test_dynamical_decoupling.py
@@ -447,7 +447,9 @@ class Echo(Gate):
             representation to satisfy PadDynamicalDecoupling's check.
             """
 
-            def __array__(self, dtype=None):
+            def __array__(self, dtype=None, copy=None):
+                if copy is False:
+                    raise ValueError("cannot produce matrix without calculation")
                 return np.eye(2, dtype=dtype)
 
         # A gate with one unbound and one bound parameter to leave in the final
diff --git a/test/python/transpiler/test_high_level_synthesis.py b/test/python/transpiler/test_high_level_synthesis.py
index 5ab78af8f581..0f074865f419 100644
--- a/test/python/transpiler/test_high_level_synthesis.py
+++ b/test/python/transpiler/test_high_level_synthesis.py
@@ -65,6 +65,7 @@
 )
 from test import QiskitTestCase  # pylint: disable=wrong-import-order
 
+
 # In what follows, we create two simple operations OpA and OpB, that potentially mimic
 # higher-level objects written by a user.
 # For OpA we define two synthesis methods:
@@ -586,6 +587,78 @@ def test_invert_and_transpose(self):
             self.assertEqual(qct.size(), 6)
             self.assertEqual(qct.depth(), 6)
 
+    def test_plugin_selection_all(self):
+        """Test that plugin_selection="all" runs every listed plugin and keeps the best result."""
+
+        linear_function = LinearFunction(self.construct_linear_circuit(7))
+        qc = QuantumCircuit(7)
+        qc.append(linear_function, [0, 1, 2, 3, 4, 5, 6])
+
+        with self.subTest("sequential"):
+            # In the default "run sequential" mode, we stop as soon as a plugin
+            # in the list returns a circuit.
+            # For this specific example the default options lead to a suboptimal circuit.
+            hls_config = HLSConfig(linear_function=[("pmh", {}), ("pmh", {"use_inverted": True})])
+            qct = HighLevelSynthesis(hls_config=hls_config)(qc)
+            self.assertEqual(LinearFunction(qct), LinearFunction(qc))
+            self.assertEqual(qct.size(), 12)
+            self.assertEqual(qct.depth(), 8)
+
+        with self.subTest("all"):
+            # In the non-default "run all" mode, we examine all plugins in the list.
+            # For this specific example we get the better result for the second plugin in the list.
+            hls_config = HLSConfig(
+                linear_function=[("pmh", {}), ("pmh", {"use_inverted": True})],
+                plugin_selection="all",
+            )
+            qct = HighLevelSynthesis(hls_config=hls_config)(qc)
+            self.assertEqual(LinearFunction(qct), LinearFunction(qc))
+            self.assertEqual(qct.size(), 6)
+            self.assertEqual(qct.depth(), 6)
+
+    def test_plugin_selection_all_with_metrix(self):
+        """Test setting plugin_selection to all and specifying different evaluation functions."""
+        # NOTE(review): "metrix" in the test name is presumably a typo for "metrics".
+        # The seed is chosen so that we get different best circuits depending on whether we
+        # want to minimize size or depth.
+        mat = random_invertible_binary_matrix(7, seed=37)
+        qc = QuantumCircuit(7)
+        qc.append(LinearFunction(mat), [0, 1, 2, 3, 4, 5, 6])
+
+        with self.subTest("size_fn"):
+            # We want to minimize the "size" (aka the number of gates) in the circuit
+            hls_config = HLSConfig(
+                linear_function=[
+                    ("pmh", {}),
+                    ("pmh", {"use_inverted": True}),
+                    ("pmh", {"use_transposed": True}),
+                    ("pmh", {"use_inverted": True, "use_transposed": True}),
+                ],
+                plugin_selection="all",
+                plugin_evaluation_fn=lambda qc: qc.size(),
+            )
+            qct = HighLevelSynthesis(hls_config=hls_config)(qc)
+            self.assertEqual(LinearFunction(qct), LinearFunction(qc))
+            self.assertEqual(qct.size(), 20)
+            self.assertEqual(qct.depth(), 15)
+
+        with self.subTest("depth_fn"):
+            # We want to minimize the "depth" (aka the number of layers) in the circuit
+            hls_config = HLSConfig(
+                linear_function=[
+                    ("pmh", {}),
+                    ("pmh", {"use_inverted": True}),
+                    ("pmh", {"use_transposed": True}),
+                    ("pmh", {"use_inverted": True, "use_transposed": True}),
+                ],
+                plugin_selection="all",
+                plugin_evaluation_fn=lambda qc: qc.depth(),
+            )
+            qct = HighLevelSynthesis(hls_config=hls_config)(qc)
+            self.assertEqual(LinearFunction(qct), LinearFunction(qc))
+            self.assertEqual(qct.size(), 23)
+            self.assertEqual(qct.depth(), 12)
+
 
 class TestKMSSynthesisLinearFunctionPlugin(QiskitTestCase):
     """Tests for the KMSSynthesisLinearFunction plugin for synthesizing linear functions."""
diff --git a/test/python/transpiler/test_layout.py b/test/python/transpiler/test_layout.py
index a77b8e55370a..d90645289542 100644
--- a/test/python/transpiler/test_layout.py
+++ b/test/python/transpiler/test_layout.py
@@ -13,12 +13,14 @@
 """Tests the layout object"""
 
 import copy
+import pickle
 import unittest
 import numpy
 
 from qiskit.circuit import QuantumRegister, Qubit
 from qiskit.transpiler.layout import Layout
 from qiskit.transpiler.exceptions import LayoutError
+from qiskit._accelerate.nlayout import NLayout
 from test import QiskitTestCase  # pylint: disable=wrong-import-order
 
 
@@ -511,5 +513,54 @@ def test_to_permutation(self):
         self.assertEqual(permutation, [1, 2, 0])
 
 
+class TestNLayout(QiskitTestCase):
+    """Direct tests for the private ``NLayout`` class; only pickling and copying are covered."""
+
+    def test_pickle(self):
+        """Test that the layout roundtrips through pickle."""
+        v2p = [3, 5, 1, 2, 0, 4]
+        size = len(v2p)
+        layout = NLayout.from_virtual_to_physical(v2p)
+        self.assertEqual([layout.virtual_to_physical(x) for x in range(size)], v2p)
+        roundtripped = pickle.loads(pickle.dumps(layout))
+        self.assertEqual([roundtripped.virtual_to_physical(x) for x in range(size)], v2p)
+
+        # Mutating the unpickled object must leave the original `layout` unchanged.
+        roundtripped.swap_virtual(0, 1)
+        expected = [5, 3, 1, 2, 0, 4]
+        self.assertEqual([layout.virtual_to_physical(x) for x in range(size)], v2p)
+        self.assertEqual([roundtripped.virtual_to_physical(x) for x in range(size)], expected)
+
+    def test_copy(self):
+        """Test that the layout roundtrips through copy."""
+        v2p = [3, 5, 1, 2, 0, 4]
+        size = len(v2p)
+        layout = NLayout.from_virtual_to_physical(v2p)
+        self.assertEqual([layout.virtual_to_physical(x) for x in range(size)], v2p)
+        roundtripped = copy.copy(layout)
+        self.assertEqual([roundtripped.virtual_to_physical(x) for x in range(size)], v2p)
+
+        # Mutating the copy must leave the original `layout` unchanged.
+        roundtripped.swap_virtual(0, 1)
+        expected = [5, 3, 1, 2, 0, 4]
+        self.assertEqual([layout.virtual_to_physical(x) for x in range(size)], v2p)
+        self.assertEqual([roundtripped.virtual_to_physical(x) for x in range(size)], expected)
+
+    def test_deepcopy(self):
+        """Test that the layout roundtrips through deepcopy."""
+        v2p = [3, 5, 1, 2, 0, 4]
+        size = len(v2p)
+        layout = NLayout.from_virtual_to_physical(v2p)
+        self.assertEqual([layout.virtual_to_physical(x) for x in range(size)], v2p)
+        roundtripped = copy.deepcopy(layout)
+        self.assertEqual([roundtripped.virtual_to_physical(x) for x in range(size)], v2p)
+
+        # Mutating the deep copy must leave the original `layout` unchanged.
+        roundtripped.swap_virtual(0, 1)
+        expected = [5, 3, 1, 2, 0, 4]
+        self.assertEqual([layout.virtual_to_physical(x) for x in range(size)], v2p)
+        self.assertEqual([roundtripped.virtual_to_physical(x) for x in range(size)], expected)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/test/python/transpiler/test_passmanager_config.py b/test/python/transpiler/test_passmanager_config.py
index fe209e3571ae..01ec7ebf133a 100644
--- a/test/python/transpiler/test_passmanager_config.py
+++ b/test/python/transpiler/test_passmanager_config.py
@@ -93,39 +93,77 @@ def test_str(self):
         pm_config.inst_map = None
         str_out = str(pm_config)
         expected = """Pass Manager Config:
-	initial_layout: None
-	basis_gates: ['h', 'u', 'p', 'u1', 'u2', 'u3', 'rz', 'sx', 'x', 'cx', 'id', 'unitary', 'measure', 'delay', 'reset']
-	inst_map: None
-	coupling_map: None
-	layout_method: None
-	routing_method: None
-	translation_method: None
-	scheduling_method: None
-	instruction_durations: 
-	backend_properties: None
-	approximation_degree: None
-	seed_transpiler: None
-	timing_constraints: None
-	unitary_synthesis_method: default
-	unitary_synthesis_plugin_config: None
-	target: Target: Basic Target
-	Number of qubits: None
-	Instructions:
-		h
-		u
-		p
-		u1
-		u2
-		u3
-		rz
-		sx
-		x
-		cx
-		id
-		unitary
-		measure
-		delay
-		reset
-	
+\tinitial_layout: None
+\tbasis_gates: ['ccx', 'ccz', 'ch', 'cp', 'crx', 'cry', 'crz', 'cs', 'csdg', 'cswap', 'csx', 'cu', 'cu1', 'cu3', 'cx', 'cy', 'cz', 'dcx', 'delay', 'ecr', 'global_phase', 'h', 'id', 'iswap', 'measure', 'p', 'r', 'rccx', 'reset', 'rx', 'rxx', 'ry', 'ryy', 'rz', 'rzx', 'rzz', 's', 'sdg', 'swap', 'sx', 'sxdg', 't', 'tdg', 'u', 'u1', 'u2', 'u3', 'unitary', 'x', 'xx_minus_yy', 'xx_plus_yy', 'y', 'z']
+\tinst_map: None
+\tcoupling_map: None
+\tlayout_method: None
+\trouting_method: None
+\ttranslation_method: None
+\tscheduling_method: None
+\tinstruction_durations:\u0020
+\tbackend_properties: None
+\tapproximation_degree: None
+\tseed_transpiler: None
+\ttiming_constraints: None
+\tunitary_synthesis_method: default
+\tunitary_synthesis_plugin_config: None
+\ttarget: Target: Basic Target
+\tNumber of qubits: None
+\tInstructions:
+\t\tccx
+\t\tccz
+\t\tch
+\t\tcp
+\t\tcrx
+\t\tcry
+\t\tcrz
+\t\tcs
+\t\tcsdg
+\t\tcswap
+\t\tcsx
+\t\tcu
+\t\tcu1
+\t\tcu3
+\t\tcx
+\t\tcy
+\t\tcz
+\t\tdcx
+\t\tdelay
+\t\tecr
+\t\tglobal_phase
+\t\th
+\t\tid
+\t\tiswap
+\t\tmeasure
+\t\tp
+\t\tr
+\t\trccx
+\t\treset
+\t\trx
+\t\trxx
+\t\try
+\t\tryy
+\t\trz
+\t\trzx
+\t\trzz
+\t\ts
+\t\tsdg
+\t\tswap
+\t\tsx
+\t\tsxdg
+\t\tt
+\t\ttdg
+\t\tu
+\t\tu1
+\t\tu2
+\t\tu3
+\t\tunitary
+\t\tx
+\t\txx_minus_yy
+\t\txx_plus_yy
+\t\ty
+\t\tz
+\t
 """
         self.assertEqual(str_out, expected)
diff --git a/test/python/transpiler/test_preset_passmanagers.py b/test/python/transpiler/test_preset_passmanagers.py
index ae1837bf111b..247aa82ec039 100644
--- a/test/python/transpiler/test_preset_passmanagers.py
+++ b/test/python/transpiler/test_preset_passmanagers.py
@@ -324,7 +324,7 @@ def test_backend(self, level):
         qc = QuantumCircuit(qr)
         qc.cx(qr[2], qr[4])
 
-        backend = GenericBackendV2(num_qubits=14, coupling_map=MELBOURNE_CMAP)
+        backend = GenericBackendV2(num_qubits=14, coupling_map=MELBOURNE_CMAP, seed=42)
 
         _ = transpile(qc, backend, optimization_level=level, callback=self.callback)
 
@@ -413,7 +413,7 @@ def get_translation_stage_plugin(self):
                 """Custom post translation stage."""
                 return "custom_stage_for_test"
 
-        target = TargetBackend(num_qubits=7)
+        target = TargetBackend(num_qubits=7, seed=42)
         qr = QuantumRegister(2, "q")
         qc = QuantumCircuit(qr)
         qc.h(qr[0])
@@ -425,7 +425,7 @@ def get_translation_stage_plugin(self):
 
     def test_level1_runs_vf2post_layout_when_routing_required(self):
         """Test that if we run routing as part of sabre layout VF2PostLayout runs."""
-        target = GenericBackendV2(num_qubits=7, coupling_map=LAGOS_CMAP)
+        target = GenericBackendV2(num_qubits=7, coupling_map=LAGOS_CMAP, seed=42)
         qc = QuantumCircuit(5)
         qc.h(0)
         qc.cy(0, 1)
@@ -448,7 +448,7 @@ def test_level1_runs_vf2post_layout_when_routing_required(self):
 
     def test_level1_runs_vf2post_layout_when_routing_method_set_and_required(self):
         """Test that if we run routing as part of sabre layout VF2PostLayout runs."""
-        target = GenericBackendV2(num_qubits=7, coupling_map=LAGOS_CMAP)
+        target = GenericBackendV2(num_qubits=7, coupling_map=LAGOS_CMAP, seed=42)
         qc = QuantumCircuit(5)
         qc.h(0)
         qc.cy(0, 1)
@@ -473,7 +473,10 @@ def test_level1_runs_vf2post_layout_when_routing_method_set_and_required(self):
     def test_level1_not_runs_vf2post_layout_when_layout_method_set(self):
         """Test that if we don't run VF2PostLayout with custom layout_method."""
         target = GenericBackendV2(
-            num_qubits=7, basis_gates=["cx", "id", "rz", "sx", "x"], coupling_map=LAGOS_CMAP
+            num_qubits=7,
+            basis_gates=["cx", "id", "rz", "sx", "x"],
+            coupling_map=LAGOS_CMAP,
+            seed=42,
         )
         qc = QuantumCircuit(5)
         qc.h(0)
@@ -495,7 +498,10 @@ def test_level1_not_runs_vf2post_layout_when_layout_method_set(self):
     def test_level1_not_run_vf2post_layout_when_trivial_is_perfect(self):
         """Test that if we find a trivial perfect layout we don't run vf2post."""
         target = GenericBackendV2(
-            num_qubits=7, basis_gates=["cx", "id", "rz", "sx", "x"], coupling_map=LAGOS_CMAP
+            num_qubits=7,
+            basis_gates=["cx", "id", "rz", "sx", "x"],
+            coupling_map=LAGOS_CMAP,
+            seed=42,
         )
         qc = QuantumCircuit(2)
         qc.h(0)
@@ -512,7 +518,10 @@ def test_level1_not_run_vf2post_layout_when_trivial_is_perfect(self):
     def test_level1_not_run_vf2post_layout_when_vf2layout_is_perfect(self):
         """Test that if we find a vf2 perfect layout we don't run vf2post."""
         target = GenericBackendV2(
-            num_qubits=7, basis_gates=["cx", "id", "rz", "sx", "x"], coupling_map=LAGOS_CMAP
+            num_qubits=7,
+            basis_gates=["cx", "id", "rz", "sx", "x"],
+            coupling_map=LAGOS_CMAP,
+            seed=42,
         )
         qc = QuantumCircuit(4)
         qc.h(0)
@@ -531,7 +540,10 @@ def test_level1_not_run_vf2post_layout_when_vf2layout_is_perfect(self):
     def test_level1_runs_vf2post_layout_when_routing_required_control_flow(self):
         """Test that if we run routing as part of sabre layout VF2PostLayout runs."""
         target = GenericBackendV2(
-            num_qubits=7, basis_gates=["cx", "id", "rz", "sx", "x"], coupling_map=LAGOS_CMAP
+            num_qubits=7,
+            basis_gates=["cx", "id", "rz", "sx", "x"],
+            coupling_map=LAGOS_CMAP,
+            seed=42,
         )
         _target = target.target
         target._target.add_instruction(ForLoopOp, name="for_loop")
@@ -558,7 +570,10 @@ def test_level1_runs_vf2post_layout_when_routing_required_control_flow(self):
     def test_level1_not_runs_vf2post_layout_when_layout_method_set_control_flow(self):
         """Test that if we don't run VF2PostLayout with custom layout_method."""
         target = GenericBackendV2(
-            num_qubits=7, basis_gates=["cx", "id", "rz", "sx", "x"], coupling_map=LAGOS_CMAP
+            num_qubits=7,
+            basis_gates=["cx", "id", "rz", "sx", "x"],
+            coupling_map=LAGOS_CMAP,
+            seed=42,
         )
         _target = target.target
         target._target.add_instruction(ForLoopOp, name="for_loop")
@@ -584,7 +599,10 @@ def test_level1_not_runs_vf2post_layout_when_layout_method_set_control_flow(self
     def test_level1_not_run_vf2post_layout_when_trivial_is_perfect_control_flow(self):
         """Test that if we find a trivial perfect layout we don't run vf2post."""
         target = GenericBackendV2(
-            num_qubits=7, basis_gates=["cx", "id", "rz", "sx", "x"], coupling_map=LAGOS_CMAP
+            num_qubits=7,
+            basis_gates=["cx", "id", "rz", "sx", "x"],
+            coupling_map=LAGOS_CMAP,
+            seed=42,
         )
         _target = target.target
         target._target.add_instruction(ForLoopOp, name="for_loop")
@@ -604,7 +622,10 @@ def test_level1_not_run_vf2post_layout_when_trivial_is_perfect_control_flow(self
     def test_level1_not_run_vf2post_layout_when_vf2layout_is_perfect_control_flow(self):
         """Test that if we find a vf2 perfect layout we don't run vf2post."""
         target = GenericBackendV2(
-            num_qubits=7, basis_gates=["cx", "id", "rz", "sx", "x"], coupling_map=LAGOS_CMAP
+            num_qubits=7,
+            basis_gates=["cx", "id", "rz", "sx", "x"],
+            coupling_map=LAGOS_CMAP,
+            seed=42,
         )
         _target = target.target
         target._target.add_instruction(ForLoopOp, name="for_loop")
@@ -630,7 +651,7 @@ class TestInitialLayouts(QiskitTestCase):
 
     @data(0, 1, 2, 3)
     def test_layout_1711(self, level):
-        """Test that a user-given initial layout is respected,
+        """Test that a user-given initial layout is respected
         in the qobj.
 
         See: https://github.com/Qiskit/qiskit-terra/issues/1711
@@ -661,9 +682,7 @@ def test_layout_1711(self, level):
             14: ancilla[12],
             15: qr[2],
         }
-
-        backend = Fake20QV1()
-        backend.configuration().coupling_map = RUESCHLIKON_CMAP
+        backend = GenericBackendV2(num_qubits=16, coupling_map=RUESCHLIKON_CMAP, seed=42)
         qc_b = transpile(qc, backend, initial_layout=initial_layout, optimization_level=level)
         qobj = assemble(qc_b)
 
@@ -672,7 +691,7 @@ def test_layout_1711(self, level):
         compiled_ops = qobj.experiments[0].instructions
         for operation in compiled_ops:
             if operation.name == "cx":
-                self.assertIn(operation.qubits, backend.configuration().coupling_map)
+                self.assertIn(tuple(operation.qubits), backend.coupling_map)
                 self.assertIn(operation.qubits, [[15, 0], [15, 2]])
 
     @data(0, 1, 2, 3)
@@ -711,10 +730,8 @@ def test_layout_2532(self, level):
             12: ancilla[7],
             13: ancilla[8],
         }
-        backend = Fake20QV1()
-        backend.configuration().coupling_map = MELBOURNE_CMAP
+        backend = GenericBackendV2(num_qubits=14, coupling_map=MELBOURNE_CMAP, seed=42)
         qc_b = transpile(qc, backend, initial_layout=initial_layout, optimization_level=level)
-
         self.assertEqual(qc_b._layout.initial_layout._p2v, final_layout)
 
         output_qr = qc_b.qregs[0]
@@ -766,7 +783,6 @@ def test_layout_2503(self, level):
         }
 
         backend = Fake20QV1()
-
         qc_b = transpile(qc, backend, initial_layout=initial_layout, optimization_level=level)
 
         self.assertEqual(qc_b._layout.initial_layout._p2v, final_layout)
@@ -796,50 +812,51 @@ def test_layout_tokyo_2845(self, level):
         qc.cx(qr1[2], qr2[0])
         qc.cx(qr2[0], qr2[1])
 
+        ancilla = QuantumRegister(15, "ancilla")
         trivial_layout = {
-            0: Qubit(QuantumRegister(3, "qr1"), 0),
-            1: Qubit(QuantumRegister(3, "qr1"), 1),
-            2: Qubit(QuantumRegister(3, "qr1"), 2),
-            3: Qubit(QuantumRegister(2, "qr2"), 0),
-            4: Qubit(QuantumRegister(2, "qr2"), 1),
-            5: Qubit(QuantumRegister(15, "ancilla"), 0),
-            6: Qubit(QuantumRegister(15, "ancilla"), 1),
-            7: Qubit(QuantumRegister(15, "ancilla"), 2),
-            8: Qubit(QuantumRegister(15, "ancilla"), 3),
-            9: Qubit(QuantumRegister(15, "ancilla"), 4),
-            10: Qubit(QuantumRegister(15, "ancilla"), 5),
-            11: Qubit(QuantumRegister(15, "ancilla"), 6),
-            12: Qubit(QuantumRegister(15, "ancilla"), 7),
-            13: Qubit(QuantumRegister(15, "ancilla"), 8),
-            14: Qubit(QuantumRegister(15, "ancilla"), 9),
-            15: Qubit(QuantumRegister(15, "ancilla"), 10),
-            16: Qubit(QuantumRegister(15, "ancilla"), 11),
-            17: Qubit(QuantumRegister(15, "ancilla"), 12),
-            18: Qubit(QuantumRegister(15, "ancilla"), 13),
-            19: Qubit(QuantumRegister(15, "ancilla"), 14),
+            0: qr1[0],
+            1: qr1[1],
+            2: qr1[2],
+            3: qr2[0],
+            4: qr2[1],
+            5: ancilla[0],
+            6: ancilla[1],
+            7: ancilla[2],
+            8: ancilla[3],
+            9: ancilla[4],
+            10: ancilla[5],
+            11: ancilla[6],
+            12: ancilla[7],
+            13: ancilla[8],
+            14: ancilla[9],
+            15: ancilla[10],
+            16: ancilla[11],
+            17: ancilla[12],
+            18: ancilla[13],
+            19: ancilla[14],
         }
 
         vf2_layout = {
-            0: Qubit(QuantumRegister(15, "ancilla"), 0),
-            1: Qubit(QuantumRegister(15, "ancilla"), 1),
-            2: Qubit(QuantumRegister(15, "ancilla"), 2),
-            3: Qubit(QuantumRegister(15, "ancilla"), 3),
-            4: Qubit(QuantumRegister(15, "ancilla"), 4),
-            5: Qubit(QuantumRegister(15, "ancilla"), 5),
-            6: Qubit(QuantumRegister(15, "ancilla"), 6),
-            7: Qubit(QuantumRegister(15, "ancilla"), 7),
-            8: Qubit(QuantumRegister(3, "qr1"), 1),
-            9: Qubit(QuantumRegister(15, "ancilla"), 8),
-            10: Qubit(QuantumRegister(15, "ancilla"), 9),
-            11: Qubit(QuantumRegister(15, "ancilla"), 10),
-            12: Qubit(QuantumRegister(3, "qr1"), 0),
-            13: Qubit(QuantumRegister(3, "qr1"), 2),
-            14: Qubit(QuantumRegister(2, "qr2"), 1),
-            15: Qubit(QuantumRegister(15, "ancilla"), 11),
-            16: Qubit(QuantumRegister(15, "ancilla"), 12),
-            17: Qubit(QuantumRegister(15, "ancilla"), 13),
-            18: Qubit(QuantumRegister(15, "ancilla"), 14),
-            19: Qubit(QuantumRegister(2, "qr2"), 0),
+            0: ancilla[0],
+            1: ancilla[1],
+            2: ancilla[2],
+            3: ancilla[3],
+            4: ancilla[4],
+            5: qr1[2],
+            6: qr2[0],
+            7: qr2[1],
+            8: ancilla[5],
+            9: ancilla[6],
+            10: qr1[1],
+            11: qr1[0],
+            12: ancilla[7],
+            13: ancilla[8],
+            14: ancilla[9],
+            15: ancilla[10],
+            16: ancilla[11],
+            17: ancilla[12],
+            18: ancilla[13],
+            19: ancilla[14],
         }
 
         # Trivial layout
@@ -856,8 +873,8 @@ def test_layout_tokyo_2845(self, level):
             expected_layout_level2,
             expected_layout_level3,
         ]
-        backend = Fake20QV1()
-        backend.configuration().coupling_map = TOKYO_CMAP
+
+        backend = GenericBackendV2(num_qubits=20, coupling_map=TOKYO_CMAP, seed=42)
         result = transpile(qc, backend, optimization_level=level, seed_transpiler=42)
         self.assertEqual(result._layout.initial_layout._p2v, expected_layouts[level])
 
@@ -904,64 +921,18 @@ def test_layout_tokyo_fully_connected_cx(self, level):
             2: ancilla[2],
             3: ancilla[3],
             4: ancilla[4],
-            5: qr[2],
-            6: qr[1],
-            7: ancilla[6],
-            8: ancilla[7],
-            9: ancilla[8],
-            10: qr[3],
-            11: qr[0],
-            12: ancilla[9],
-            13: ancilla[10],
-            14: ancilla[11],
-            15: ancilla[5],
-            16: qr[4],
-            17: ancilla[12],
-            18: ancilla[13],
-            19: ancilla[14],
-        }
-
-        sabre_layout_lvl_2 = {
-            0: ancilla[0],
-            1: ancilla[1],
-            2: ancilla[2],
-            3: ancilla[3],
-            4: ancilla[4],
-            5: qr[2],
-            6: qr[1],
-            7: ancilla[6],
-            8: ancilla[7],
-            9: ancilla[8],
-            10: qr[3],
-            11: qr[0],
-            12: ancilla[9],
-            13: ancilla[10],
-            14: ancilla[11],
-            15: ancilla[5],
-            16: qr[4],
-            17: ancilla[12],
-            18: ancilla[13],
-            19: ancilla[14],
-        }
-
-        sabre_layout_lvl_3 = {
-            0: ancilla[0],
-            1: ancilla[1],
-            2: ancilla[2],
-            3: ancilla[3],
-            4: ancilla[4],
-            5: qr[2],
-            6: qr[1],
-            7: ancilla[6],
-            8: ancilla[7],
-            9: ancilla[8],
-            10: qr[3],
-            11: qr[0],
-            12: ancilla[9],
-            13: ancilla[10],
-            14: ancilla[11],
-            15: ancilla[5],
-            16: qr[4],
+            5: qr[1],
+            6: qr[0],
+            7: qr[4],
+            8: ancilla[6],
+            9: ancilla[7],
+            10: qr[2],
+            11: qr[3],
+            12: ancilla[5],
+            13: ancilla[8],
+            14: ancilla[9],
+            15: ancilla[10],
+            16: ancilla[11],
             17: ancilla[12],
             18: ancilla[13],
             19: ancilla[14],
@@ -969,8 +940,8 @@ def test_layout_tokyo_fully_connected_cx(self, level):
 
         expected_layout_level0 = trivial_layout
         expected_layout_level1 = sabre_layout
-        expected_layout_level2 = sabre_layout_lvl_2
-        expected_layout_level3 = sabre_layout_lvl_3
+        expected_layout_level2 = sabre_layout
+        expected_layout_level3 = sabre_layout
 
         expected_layouts = [
             expected_layout_level0,
@@ -978,9 +949,7 @@ def test_layout_tokyo_fully_connected_cx(self, level):
             expected_layout_level2,
             expected_layout_level3,
         ]
-        backend = Fake20QV1()
-        backend.configuration().coupling_map = TOKYO_CMAP
-
+        backend = GenericBackendV2(num_qubits=20, coupling_map=TOKYO_CMAP, seed=42)
         result = transpile(qc, backend, optimization_level=level, seed_transpiler=42)
         self.assertEqual(result._layout.initial_layout._p2v, expected_layouts[level])
 
@@ -991,12 +960,10 @@ def test_all_levels_use_trivial_if_perfect(self, level):
         See: https://github.com/Qiskit/qiskit-terra/issues/5694 for more
         details
         """
-        backend = Fake20QV1()
-        backend.configuration().coupling_map = TOKYO_CMAP
-        config = backend.configuration()
+        backend = GenericBackendV2(num_qubits=20, coupling_map=TOKYO_CMAP, seed=42)
 
-        rows = [x[0] for x in config.coupling_map]
-        cols = [x[1] for x in config.coupling_map]
+        rows = [x[0] for x in backend.coupling_map]
+        cols = [x[1] for x in backend.coupling_map]
 
         adjacency_matrix = np.zeros((20, 20))
         adjacency_matrix[rows, cols] = 1
@@ -1255,7 +1222,7 @@ def test_with_backend(self, optimization_level):
     @data(0, 1, 2, 3)
     def test_with_no_backend(self, optimization_level):
         """Test a passmanager is constructed with no backend and optimization level."""
-        target = GenericBackendV2(num_qubits=7, coupling_map=LAGOS_CMAP)
+        target = GenericBackendV2(num_qubits=7, coupling_map=LAGOS_CMAP, seed=42)
         pm = generate_preset_pass_manager(
             optimization_level,
             coupling_map=target.coupling_map,
@@ -1270,7 +1237,7 @@ def test_with_no_backend(self, optimization_level):
     @data(0, 1, 2, 3)
     def test_with_no_backend_only_target(self, optimization_level):
         """Test a passmanager is constructed with a manual target and optimization level."""
-        target = GenericBackendV2(num_qubits=7, coupling_map=LAGOS_CMAP)
+        target = GenericBackendV2(num_qubits=7, coupling_map=LAGOS_CMAP, seed=42)
         pm = generate_preset_pass_manager(optimization_level, target=target.target)
         self.assertIsInstance(pm, PassManager)
 
@@ -1299,7 +1266,7 @@ def get_translation_stage_plugin(self):
                 """Custom post translation stage."""
                 return "custom_stage_for_test"
 
-        target = TargetBackend(num_qubits=7, coupling_map=LAGOS_CMAP)
+        target = TargetBackend(num_qubits=7, coupling_map=LAGOS_CMAP, seed=42)
         pm = generate_preset_pass_manager(optimization_level, backend=target)
         self.assertIsInstance(pm, PassManager)
 
@@ -1331,7 +1298,7 @@ def get_translation_stage_plugin(self):
                 """Custom post translation stage."""
                 return "custom_stage_for_test"
 
-        target = TargetBackend(num_qubits=7, coupling_map=LAGOS_CMAP)
+        target = TargetBackend(num_qubits=7, coupling_map=LAGOS_CMAP, seed=42)
         pm = generate_preset_pass_manager(optimization_level, backend=target)
         self.assertIsInstance(pm, PassManager)
 
@@ -1363,7 +1330,7 @@ def get_translation_stage_plugin(self):
                 """Custom post translation stage."""
                 return "custom_stage_for_test"
 
-        target = TargetBackend(num_qubits=7, coupling_map=LAGOS_CMAP)
+        target = TargetBackend(num_qubits=7, coupling_map=LAGOS_CMAP, seed=42)
         pm = generate_preset_pass_manager(optimization_level, backend=target)
         self.assertIsInstance(pm, PassManager)
 
@@ -1395,7 +1362,7 @@ def get_translation_stage_plugin(self):
                 """Custom post translation stage."""
                 return "custom_stage_for_test"
 
-        target = TargetBackend(num_qubits=7, coupling_map=LAGOS_CMAP)
+        target = TargetBackend(num_qubits=7, coupling_map=LAGOS_CMAP, seed=42)
         pm = generate_preset_pass_manager(optimization_level, backend=target)
         self.assertIsInstance(pm, PassManager)
 
diff --git a/test/python/transpiler/test_pulse_gate_pass.py b/test/python/transpiler/test_pulse_gate_pass.py
index 8de8ceb66ef8..a11d4c4a6b53 100644
--- a/test/python/transpiler/test_pulse_gate_pass.py
+++ b/test/python/transpiler/test_pulse_gate_pass.py
@@ -16,6 +16,7 @@
 
 from qiskit import pulse, circuit, transpile
 from qiskit.providers.fake_provider import Fake27QPulseV1, GenericBackendV2
+from qiskit.providers.models import GateConfig
 from qiskit.quantum_info.random import random_unitary
 from test import QiskitTestCase  # pylint: disable=wrong-import-order
 
@@ -185,6 +186,12 @@ def test_transpile_with_custom_gate(self):
         backend.defaults().instruction_schedule_map.add(
             "my_gate", (1,), self.my_gate_q1, arguments=["P0"]
         )
+        # Add gate to backend configuration
+        backend.configuration().basis_gates.append("my_gate")
+        dummy_config = GateConfig(
+            name="my_gate", parameters=[], qasm_def="", coupling_map=[(0,), (1,)]
+        )
+        backend.configuration().gates.append(dummy_config)
         # Remove timing constraints to avoid triggering
         # scheduling passes.
         backend.configuration().timing_constraints = {}
@@ -212,6 +219,10 @@ def test_transpile_with_parameterized_custom_gate(self):
         backend.defaults().instruction_schedule_map.add(
             "my_gate", (0,), self.my_gate_q0, arguments=["P0"]
         )
+        # Add gate to backend configuration
+        backend.configuration().basis_gates.append("my_gate")
+        dummy_config = GateConfig(name="my_gate", parameters=[], qasm_def="", coupling_map=[(0,)])
+        backend.configuration().gates.append(dummy_config)
         # Remove timing constraints to avoid triggering
         # scheduling passes.
         backend.configuration().timing_constraints = {}
@@ -237,6 +248,10 @@ def test_transpile_with_multiple_circuits(self):
         backend.defaults().instruction_schedule_map.add(
             "my_gate", (0,), self.my_gate_q0, arguments=["P0"]
         )
+        # Add gate to backend configuration
+        backend.configuration().basis_gates.append("my_gate")
+        dummy_config = GateConfig(name="my_gate", parameters=[], qasm_def="", coupling_map=[(0,)])
+        backend.configuration().gates.append(dummy_config)
         # Remove timing constraints to avoid triggering
         # scheduling passes.
         backend.configuration().timing_constraints = {}
@@ -263,6 +278,10 @@ def test_multiple_instructions_with_different_parameters(self):
         backend.defaults().instruction_schedule_map.add(
             "my_gate", (0,), self.my_gate_q0, arguments=["P0"]
         )
+        # Add gate to backend configuration
+        backend.configuration().basis_gates.append("my_gate")
+        dummy_config = GateConfig(name="my_gate", parameters=[], qasm_def="", coupling_map=[(0,)])
+        backend.configuration().gates.append(dummy_config)
         # Remove timing constraints to avoid triggering
         # scheduling passes.
         backend.configuration().timing_constraints = {}
diff --git a/test/python/visualization/test_circuit_text_drawer.py b/test/python/visualization/test_circuit_text_drawer.py
index 9b34257f567c..5f72a7d1bbbc 100644
--- a/test/python/visualization/test_circuit_text_drawer.py
+++ b/test/python/visualization/test_circuit_text_drawer.py
@@ -12,6 +12,9 @@
 
 """circuit_drawer with output="text" draws a circuit in ascii art"""
 
+# Sometimes we want to test long-lined output.
+# pylint: disable=line-too-long
+
 import pathlib
 import os
 import tempfile
@@ -37,7 +40,7 @@
 from qiskit.visualization import circuit_drawer
 from qiskit.visualization.circuit import text as elements
 from qiskit.providers.fake_provider import GenericBackendV2
-from qiskit.circuit.classical import expr
+from qiskit.circuit.classical import expr, types
 from qiskit.circuit.library import (
     HGate,
     U2Gate,
@@ -6316,6 +6319,80 @@ def test_switch_with_expression(self):
             expected,
         )
 
+    def test_nested_if_else_op_var(self):
+        """Test the text drawing of nested if blocks whose conditions use standalone Vars."""
+        expected = "\n".join(
+            [
+                "     ┌───────── ┌────────────────       ───────┐ ┌──────────────────── ┌───┐ ───────┐  ───────┐ ",
+                "q_0: ┤          ┤                 ──■──        ├─┤ If-1 c && a == 128  ┤ H ├  End-1 ├─        ├─",
+                "     │ If-0 !b  │ If-1 b == c[0]  ┌─┴─┐  End-1 │ └──────────────────── └───┘ ───────┘   End-0 │ ",
+                "q_1: ┤          ┤                 ┤ X ├        ├──────────────────────────────────────        ├─",
+                "     └───────── └───────╥──────── └───┘ ───────┘                                       ───────┘ ",
+                "                    ┌───╨────┐                                                                  ",
+                "c: 2/═══════════════╡ [expr] ╞══════════════════════════════════════════════════════════════════",
+                "                    └────────┘                                                                  ",
+            ]
+        )
+        a = expr.Var.new("a", types.Uint(8))
+        qc = QuantumCircuit(2, 2, inputs=[a])
+        b = qc.add_var("b", False)
+        qc.store(a, 128)
+        with qc.if_test(expr.logic_not(b)):
+            # Mix old-style and new-style: compare the Var `b` against a Clbit.
+            with qc.if_test(expr.equal(b, qc.clbits[0])):
+                qc.cx(0, 1)
+            c = qc.add_var("c", b)
+            with qc.if_test(expr.logic_and(c, expr.equal(a, 128))):
+                qc.h(0)
+
+        actual = str(qc.draw("text", fold=-1, initial_state=False))
+        self.assertEqual(actual, expected)
+
+    def test_nested_switch_op_var(self):
+        """Test the text drawing of nested switches on standalone Vars with `fold=80`."""
+        expected = "\n".join(
+            [
+                "     ┌───────────── ┌──────────── ┌──────────── ┌────────────      »",
+                "q_0: ┤              ┤             ┤             ┤             ──■──»",
+                "     │ Switch-0 ~a  │ Case-0 (0)  │ Switch-1 b  │ Case-1 (2)  ┌─┴─┐»",
+                "q_1: ┤              ┤             ┤             ┤             ┤ X ├»",
+                "     └───────────── └──────────── └──────────── └──────────── └───┘»",
+                "c: 2/══════════════════════════════════════════════════════════════»",
+                "                                                                   »",
+                "«     ┌──────────────── ┌───┐ ───────┐ ┌──────────────── ┌──────── ┌───┐»",
+                "«q_0: ┤                 ┤ X ├        ├─┤                 ┤ If-1 c  ┤ H ├»",
+                "«     │ Case-1 default  └─┬─┘  End-1 │ │ Case-0 default  └──────── └───┘»",
+                "«q_1: ┤                 ──■──        ├─┤                 ───────────────»",
+                "«     └────────────────       ───────┘ └────────────────                »",
+                "«c: 2/══════════════════════════════════════════════════════════════════»",
+                "«                                                                       »",
+                "«      ───────┐  ───────┐ ",
+                "«q_0:   End-1 ├─        ├─",
+                "«      ───────┘   End-0 │ ",
+                "«q_1: ──────────        ├─",
+                "«                ───────┘ ",
+                "«c: 2/════════════════════",
+                "«                         ",
+            ]
+        )
+
+        a = expr.Var.new("a", types.Uint(8))
+        qc = QuantumCircuit(2, 2, inputs=[a])
+        b = qc.add_var("b", expr.lift(5, a.type))
+        with qc.switch(expr.bit_not(a)) as case:
+            with case(0):
+                with qc.switch(b) as case2:
+                    with case2(2):
+                        qc.cx(0, 1)
+                    with case2(case2.DEFAULT):
+                        qc.cx(1, 0)
+            with case(case.DEFAULT):
+                c = qc.add_var("c", expr.equal(a, b))
+                with qc.if_test(c):
+                    qc.h(0)
+        actual = str(qc.draw("text", fold=80, initial_state=False))
+        self.assertEqual(actual, expected)
+
 
 class TestCircuitAnnotatedOperations(QiskitVisualizationTestCase):
     """Test AnnotatedOperations and other non-Instructions."""
diff --git a/test/qpy_compat/test_qpy.py b/test/qpy_compat/test_qpy.py
index 345d9dc0a447..58ee1abc2a2f 100755
--- a/test/qpy_compat/test_qpy.py
+++ b/test/qpy_compat/test_qpy.py
@@ -754,6 +754,54 @@ def generate_control_flow_expr():
     return [qc1, qc2, qc3, qc4]
 
 
+def generate_standalone_var():
+    """Return a one-element list holding a circuit that uses standalone variables."""
+    import uuid
+    from qiskit.circuit.classical import expr, types
+
+    # This is the low-level, non-preferred way to construct variables, but we need the UUIDs to be
+    # deterministic between separate invocations of the script.
+    uuids = [
+        uuid.UUID(bytes=b"hello, qpy world", version=4),
+        uuid.UUID(bytes=b"not a good uuid4", version=4),
+        uuid.UUID(bytes=b"but it's ok here", version=4),
+        uuid.UUID(bytes=b"any old 16 bytes", version=4),
+        uuid.UUID(bytes=b"and another load", version=4),
+    ]
+    a = expr.Var(uuids[0], types.Bool(), name="a")
+    b = expr.Var(uuids[1], types.Bool(), name="θψφ")
+    b_other = expr.Var(uuids[2], types.Bool(), name=b.name)
+    c = expr.Var(uuids[3], types.Uint(8), name="🐍🐍🐍")
+    d = expr.Var(uuids[4], types.Uint(8), name="d")
+
+    qc = QuantumCircuit(1, 1, inputs=[a], name="standalone_var")
+    qc.add_var(b, expr.logic_not(a))
+
+    qc.add_var(c, expr.lift(0, c.type))
+    with qc.if_test(b) as else_:
+        qc.store(c, expr.lift(3, c.type))
+        with qc.while_loop(b):
+            qc.add_var(c, expr.lift(7, c.type))
+    with else_:
+        qc.add_var(d, expr.lift(7, d.type))
+
+    qc.measure(0, 0)
+    with qc.switch(c) as case:
+        with case(0):
+            qc.store(b, True)
+        with case(1):
+            qc.store(qc.clbits[0], False)
+        with case(2):
+            # Explicit shadowing: `b_other` reuses the name of `b` but has a different UUID.
+            qc.add_var(b_other, True)
+        with case(3):
+            qc.store(a, False)
+        with case(case.DEFAULT):
+            pass
+
+    return [qc]
+
+
 def generate_circuits(version_parts):
     """Generate reference circuits."""
     output_circuits = {
@@ -802,6 +850,8 @@ def generate_circuits(version_parts):
         output_circuits["clifford.qpy"] = generate_clifford_circuits()
     if version_parts >= (1, 0, 0):
         output_circuits["annotated.qpy"] = generate_annotated_circuits()
+    if version_parts >= (1, 1, 0):
+        output_circuits["standalone_vars.qpy"] = generate_standalone_var()
     return output_circuits
 
 
diff --git a/test/visual/mpl/circuit/references/if_else_standalone_var.png b/test/visual/mpl/circuit/references/if_else_standalone_var.png
new file mode 100644
index 000000000000..6266a0caeb00
Binary files /dev/null and b/test/visual/mpl/circuit/references/if_else_standalone_var.png differ
diff --git a/test/visual/mpl/circuit/references/switch_standalone_var.png b/test/visual/mpl/circuit/references/switch_standalone_var.png
new file mode 100644
index 000000000000..8b8c78828917
Binary files /dev/null and b/test/visual/mpl/circuit/references/switch_standalone_var.png differ
diff --git a/test/visual/mpl/circuit/test_circuit_matplotlib_drawer.py b/test/visual/mpl/circuit/test_circuit_matplotlib_drawer.py
index e99cb3f628d8..9e3dd5cc48eb 100644
--- a/test/visual/mpl/circuit/test_circuit_matplotlib_drawer.py
+++ b/test/visual/mpl/circuit/test_circuit_matplotlib_drawer.py
@@ -47,7 +47,7 @@
 )
 from qiskit.circuit import Parameter, Qubit, Clbit, IfElseOp, SwitchCaseOp
 from qiskit.circuit.library import IQP
-from qiskit.circuit.classical import expr
+from qiskit.circuit.classical import expr, types
 from qiskit.quantum_info import random_clifford
 from qiskit.quantum_info.random import random_unitary
 from qiskit.utils import optionals
@@ -2300,6 +2300,59 @@ def test_no_qreg_names_after_layout(self):
         )
         self.assertGreaterEqual(ratio, self.threshold)
 
+    def test_if_else_standalone_var(self):
+        """Draw a circuit with nested ``if_test`` blocks conditioned on standalone variables."""
+        var_a = expr.Var.new("a", types.Uint(8))
+        circuit = QuantumCircuit(2, 2, inputs=[var_a])
+        var_b = circuit.add_var("b", False)
+        circuit.store(var_a, 128)
+        with circuit.if_test(expr.logic_not(var_b)):
+            # Mix an old-style classical bit with a new-style standalone variable.
+            with circuit.if_test(expr.equal(var_b, circuit.clbits[0])):
+                circuit.cx(0, 1)
+            var_c = circuit.add_var("c", var_b)
+            both = expr.logic_and(var_c, expr.equal(var_a, 128))
+            with circuit.if_test(both):
+                circuit.h(0)
+        image_name = "if_else_standalone_var.png"
+        self.circuit_drawer(circuit, output="mpl", filename=image_name)
+
+        # Check the rendered image against its reference.
+        rendered = self._image_path(image_name)
+        reference = self._reference_path(image_name)
+        ratio = VisualTestUtilities._save_diff(
+            rendered, reference, image_name, FAILURE_DIFF_DIR, FAILURE_PREFIX
+        )
+        self.assertGreaterEqual(ratio, self.threshold)
+
+    def test_switch_standalone_var(self):
+        """Draw a circuit with nested ``switch`` blocks whose targets use standalone variables."""
+        var_a = expr.Var.new("a", types.Uint(8))
+        circuit = QuantumCircuit(2, 2, inputs=[var_a])
+        var_b = circuit.add_var("b", expr.lift(5, var_a.type))
+        with circuit.switch(expr.bit_not(var_a)) as outer_case:
+            with outer_case(0):
+                with circuit.switch(var_b) as inner_case:
+                    with inner_case(2):
+                        circuit.cx(0, 1)
+                    with inner_case(inner_case.DEFAULT):
+                        circuit.cx(1, 0)
+            with outer_case(outer_case.DEFAULT):
+                # The default branch declares a variable and immediately tests it.
+                var_c = circuit.add_var("c", expr.equal(var_a, var_b))
+                with circuit.if_test(var_c):
+                    circuit.h(0)
+        image_name = "switch_standalone_var.png"
+        self.circuit_drawer(circuit, output="mpl", filename=image_name)
+
+        rendered = self._image_path(image_name)
+        reference = self._reference_path(image_name)
+        ratio = VisualTestUtilities._save_diff(
+            rendered, reference, image_name, FAILURE_DIFF_DIR, FAILURE_PREFIX
+        )
+        # The similarity ratio must reach the configured threshold.
+        self.assertGreaterEqual(ratio, self.threshold)
+
 
 if __name__ == "__main__":
     unittest.main(verbosity=1)
diff --git a/tools/build_pgo.sh b/tools/build_pgo.sh
index d0e88bf6f745..8553691bdfe4 100755
--- a/tools/build_pgo.sh
+++ b/tools/build_pgo.sh
@@ -17,6 +17,12 @@ else
     source build_pgo/bin/activate
 fi
 
+arch=$(uname -m)
+# macOS reports this architecture as "arm64", whereas Rust names it "aarch64".
+case "$arch" in
+    arm64) arch="aarch64" ;;
+esac
+
 # Build with instrumentation
 pip install -U -c constraints.txt setuptools-rust wheel setuptools
 RUSTFLAGS="-Cprofile-generate=/tmp/pgo-data" pip install --prefer-binary -c constraints.txt -r requirements-dev.txt -e .
@@ -29,4 +35,5 @@ python tools/pgo_scripts/test_utility_scale.py
 
 deactivate
 
-${HOME}/.rustup/toolchains/*x86_64*/lib/rustlib/x86_64*/bin/llvm-profdata merge -o $merged_path /tmp/pgo-data
+# Quote every expansion so paths containing spaces are not word-split; the globs stay unquoted.
+"${HOME}"/.rustup/toolchains/*"$arch"*/lib/rustlib/"$arch"*/bin/llvm-profdata merge -o "$merged_path" /tmp/pgo-data