Linfile cleaning: keep BPMs and limit-based cleaning (#439)
* Implementation and test
* Changelog and version bump
JoschD authored Mar 18, 2024
1 parent e42b082 commit 2cf93e1
Showing 4 changed files with 180 additions and 13 deletions.
14 changes: 14 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,19 @@
# OMC3 Changelog

#### 2024-03-18 - v0.14.0 - _jdilly_

- Added:
  - Linfile Updater: `keep` flag to exclude BPMs with the given names from cleaning, and an option to clean manually between two given limits.

#### 2024-03-08 - v0.13.1 - _jdilly_, _awegsche_, _mlegarre_, _fesoubel_

- Added:
- Knob Extractor: Lumiscan Knob

- Fixed:
- BBS converter: fixed closed orbit units
- Optics: Pandas indexing error in DPP

#### 2023-12-07 - v0.13.0 - _awegsche_

- Added:
2 changes: 1 addition & 1 deletion omc3/__init__.py
@@ -11,7 +11,7 @@
__title__ = "omc3"
__description__ = "An accelerator physics tools package for the OMC team at CERN."
__url__ = "https://github.com/pylhc/omc3"
__version__ = "0.13.1"
__version__ = "0.14.0"
__author__ = "pylhc"
__author_email__ = "[email protected]"
__license__ = "MIT"
81 changes: 71 additions & 10 deletions omc3/scripts/linfile_clean.py
@@ -4,6 +4,10 @@
Performs an automated cleaning of different columns in the lin-file
as a standalone script to allow for manual refinement after harpy is done.
The type of cleaning is determined by the number of values in the ``limit``
parameter. When no ``limit`` is given or a single number is given,
auto-cleaning is performed:
All data is assumed to be Gaussian-distributed around a "true" value,
and outliers are cleaned by calculating average and standard deviation
of the given data.
@@ -17,6 +21,9 @@
cleaned. The limit is given in whatever units the data itself is in and
is an absolute value.
If two values are given for the ``limit`` parameter, all data-points in between
these limits are kept and all data-points outside of these limits are cleaned.
Cleaning is done per given file independently,
i.e. BPMs removed from one file can still be present in the next.
The columns are iterated on the same file, i.e. the cleaning is done
@@ -53,10 +60,15 @@
Columns to clean on.
- **keep** *(str)*:
Do not clean BPMs with given names.
- **limit** *(float)*:
Do not clean data-points deviating less than this limit from the
average.
Two values: Do not clean data-points in between these values.
Single value (auto-clean): Do not clean data-points deviating less than this limit from the average.
default: ``0.0``
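
As a usage sketch of the new options (mirroring the call signature exercised in the tests further down; the file path, column and BPM names here are placeholders, not values from this commit):

```python
from omc3.scripts.linfile_clean import main

# Keep only tune values between the two limits, but never drop the listed BPM.
main(
    files=["measurement/beam1.linx"],  # placeholder path
    columns=["TUNEX"],                 # placeholder column name
    limit=[0.20, 0.35],                # two values: keep data-points between these limits
    keep=["BPM.10L4.B1"],              # BPMs that are never cleaned
    backup=True,                       # back up the original linfile before writing
)
```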
@@ -117,8 +129,14 @@ def get_params():
),
limit=dict(
type=float,
help="Do not clean data-points deviating less than this limit from the average.",
default=0.0,
nargs='+',
help="Two values: Do not clean data-points in between these values. "
"Single value (auto-clean): Do not clean data-points deviating less than this limit from the average.",
),
keep=dict(
nargs='+',
type=str,
help="Do not clean BPMs with given names.",
),
backup=dict(
action="store_true",
@@ -140,7 +158,7 @@ def main(opt):

if opt.columns is None:
raise ValueError("The option 'columns' is required for cleaning.")
clean_columns(opt.files, opt.columns, opt.limit, opt.backup)
clean_columns(opt.files, opt.columns, opt.limit, opt.keep, opt.backup)


# Restore ----------------------------------------------------------------------
@@ -184,15 +202,30 @@ def _restore_file(file):

# Clean ------------------------------------------------------------------------

def clean_columns(files: Sequence[Union[Path, str]], columns: Sequence[str],
limit: float = 0.0, backup: bool = True):
def clean_columns(files: Sequence[Union[Path, str]],
columns: Sequence[str],
limit: float = None, # default set in _check_limits
keep: Sequence[str] = None, # default set below
backup: bool = True):
""" Clean the columns in the given files."""
for file in files:
file = Path(file)
LOG.info(f"Cleaning {file.name}.")

# check limits
limit = _check_limits(limit)

# read and check file
df = tfs.read_tfs(file, index=COL_NAME)
if keep is None:
keep = ()
not_found_bpms = set(keep) - set(df.index)
if len(not_found_bpms):
LOG.warning(f"The following BPMs to keep were not found in {file.name}:\n{not_found_bpms}")

# clean
for column in columns:
df = _filter_by_column(df, column, limit)
df = _filter_by_column(df, column, limit, keep)
df.headers.update(_compute_headers(df))

if backup:
@@ -201,13 +234,41 @@ def clean_columns(files: Sequence[Union[Path, str]], columns: Sequence[str],
tfs.write_tfs(file, df, save_index=COL_NAME)


def _filter_by_column(df: pd.DataFrame, column: str, limit: Number) -> pd.DataFrame:
def _check_limits(limit: Union[Sequence[Number], Number]) -> Sequence[Number]:
""" Check that one or two limits are given and convert them into a tuple if needed."""
if limit is None:
limit = (0.0,)

try:
len(limit)
except TypeError:
limit = (limit,)

if len(limit) == 1:
LOG.info("Performing auto-cleaning.")

elif len(limit) == 2:
LOG.info(f"Performing cleaning between the limits {limit}.")
limit = tuple(sorted(limit))

else:
raise ValueError(f"Expected 1 or 2 limits, got {len(limit)}.")

return limit
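
For illustration, the normalization performed by `_check_limits` (a private helper, shown above) can be summarized as follows, with the behavior read directly off the code in this diff:

```python
from omc3.scripts.linfile_clean import _check_limits

_check_limits(None)          # -> (0.0,)        auto-clean with the default limit
_check_limits(0.01)          # -> (0.01,)       scalar is wrapped into a one-tuple (auto-clean)
_check_limits((0.35, 0.20))  # -> (0.20, 0.35)  two limits are sorted into ascending order
_check_limits((1, 2, 3))     # raises ValueError: 1 or 2 limits expected
```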


def _filter_by_column(df: pd.DataFrame, column: str, limit: Sequence[Number], keep: Sequence[str]) -> pd.DataFrame:
"""Get the dataframe with all rows dropped filtered by the given column."""
if column not in df.columns:
LOG.info(f"{column} not in current file. Skipping cleaning.")
return df

good_bpms = get_filter_mask(data=df[column], limit=limit)
keep_bpms = df.index.isin(keep)
if len(limit) == 1:
good_bpms = get_filter_mask(data=df[column], limit=limit[0]) | keep_bpms
else:
good_bpms = df[column].between(*limit) | keep_bpms

n_good, n_total = sum(good_bpms), len(good_bpms)
LOG.info(f"Cleaned {n_total-n_good:d} of {n_total:d} elements in {column} ({n_good:d} remaining).")
return df.loc[good_bpms, :]
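
The keep logic amounts to OR-ing a boolean keep-mask into the cleaning mask; a self-contained pandas sketch of the two-limit branch above (column values and BPM names are made up for illustration):

```python
import pandas as pd

df = pd.DataFrame(
    {"TUNEX": [0.26, 0.27, 0.36, 0.10]},
    index=["BPM.A", "BPM.B", "BPM.C", "BPM.D"],
)
limit = (0.20, 0.35)
keep = ["BPM.C"]

keep_bpms = df.index.isin(keep)                      # True for BPMs that must survive
good_bpms = df["TUNEX"].between(*limit) | keep_bpms  # inside the limits OR explicitly kept
print(df.loc[good_bpms, :])                          # BPM.A, BPM.B and BPM.C remain; BPM.D is cleaned
```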
96 changes: 94 additions & 2 deletions tests/unit/test_linfile_clean.py
@@ -29,7 +29,7 @@ def test_filter_tune(tmp_path):

for plane in PLANES:
assert len(filtered[plane]) == 4
assert unfiltered[plane][COL_NAME][2] not in filtered[plane][COL_NAME]
assert unfiltered[plane][COL_NAME][2] not in filtered[plane][COL_NAME].to_list()


def test_filter_tune_limit(tmp_path):
@@ -48,6 +48,49 @@
assert_frame_equal(unfiltered[plane], filtered[plane])


@pytest.mark.basic
def test_keep_bpms(tmp_path):
""" Test that keeping BPMs works. """
columns = [COL_TUNE]
plane_columns = [f"{col}{p}" for col in columns for p in PLANES]

# BPMs expected to be filtered (due to the values in the example linfiles)
filtered_bpms = {
"X": ["BPM.10L4.B1", "BPM.10L2.B1"],
"Y": ["BPM.10L1.B1", "BPM.10L2.B1"],
}

# Test that all BPMs are filtered without the keep-flag --------------------
linfiles = _copy_and_modify_linfiles(tmp_path, columns=columns)
unfiltered = {p: tfs.read(f) for p, f in linfiles.items()}

# without a limit given, auto-cleaning removes two elements per plane
clean_columns(files=linfiles.values(), columns=plane_columns)
filtered = {p: tfs.read(f) for p, f in linfiles.items()}


for plane in PLANES:
assert len(filtered[plane]) == len(unfiltered[plane]) - 2
for bpm in filtered_bpms[plane]:
assert bpm not in filtered[plane][COL_NAME].to_list()
assert bpm in unfiltered[plane][COL_NAME].to_list()

# Now with keeping one of them ---------------------------------------------
linfiles = _copy_and_modify_linfiles(tmp_path, columns=columns)
unfiltered = {p: tfs.read(f) for p, f in linfiles.items()}

# without a limit given, auto-cleaning would remove two elements per plane; one BPM is kept here
clean_columns(files=linfiles.values(), columns=plane_columns, keep=[filtered_bpms["X"][1]])
filtered = {p: tfs.read(f) for p, f in linfiles.items()}
for plane in PLANES:
assert len(filtered[plane]) == len(unfiltered[plane]) - 1
for bpm in filtered_bpms[plane]:
assert bpm in unfiltered[plane][COL_NAME].to_list()

assert filtered_bpms[plane][0] not in filtered[plane][COL_NAME].to_list()
assert filtered_bpms[plane][1] in filtered[plane][COL_NAME].to_list()


@pytest.mark.basic
def test_filter_tune_nattune(tmp_path):
"""Tests that filtering works for two columns."""
@@ -64,6 +107,55 @@
assert len(filtered[plane]) == 2 # empirically determined


@pytest.mark.basic
def test_filter_between_limits(tmp_path):
""" Test filtering works on outlier created by modify linfiles function. """
columns = [COL_TUNE]
plane_columns = [f"{col}{p}" for col in columns for p in PLANES]

# Test that no BPMs are filtered by the auto-clean (sanity check) ----------
linfiles = _copy_and_modify_linfiles(tmp_path, columns=columns, index=[2, 3], by=0.1)
unfiltered = {p: tfs.read(f) for p, f in linfiles.items()}

clean_columns(files=linfiles.values(), columns=plane_columns)

filtered = {p: tfs.read(f) for p, f in linfiles.items()}

for plane in PLANES:
assert_frame_equal(unfiltered[plane], filtered[plane])

# Test that the two BPMs are filtered by the limits-clean ------------------
linfiles = _copy_and_modify_linfiles(tmp_path, columns=columns, index=[2, 3], by=0.1)
unfiltered = {p: tfs.read(f) for p, f in linfiles.items()}

# choosing values so that both planes are filtered
# X tunes are 0.26 + 0.1, Y tunes are 0.32 + 0.1
clean_columns(files=linfiles.values(), columns=plane_columns, limit=(0.20, 0.35))

filtered = {p: tfs.read(f) for p, f in linfiles.items()}

for plane in PLANES:
assert len(filtered[plane]) == 3
assert unfiltered[plane][COL_NAME][2] not in filtered[plane][COL_NAME].to_list()
assert unfiltered[plane][COL_NAME][3] not in filtered[plane][COL_NAME].to_list()


# Test that keep flag is also respected in the limits-clean ----------------
linfiles = _copy_and_modify_linfiles(tmp_path, columns=columns, index=[2, 3], by=0.1)
unfiltered = {p: tfs.read(f) for p, f in linfiles.items()}

# choosing values so that both planes are filtered
# X tunes are 0.26 + 0.1, Y tunes are 0.32 + 0.1
clean_columns(files=linfiles.values(), columns=plane_columns, limit=(0.20, 0.35), keep=[unfiltered["X"][COL_NAME][2]])

filtered = {p: tfs.read(f) for p, f in linfiles.items()}

for plane in PLANES:
assert len(filtered[plane]) == 4
assert unfiltered[plane][COL_NAME][2] in filtered[plane][COL_NAME].to_list()
assert unfiltered[plane][COL_NAME][3] not in filtered[plane][COL_NAME].to_list()


@pytest.mark.basic
def test_backup_and_restore(tmp_path):
"""Test that the backup and restore functionality works."""
@@ -106,7 +198,7 @@ def test_main(tmp_path):
unfiltered = {p: tfs.read(f) for p, f in linfiles.items()}

# if limit not given, would filter two elements in X
main(files=list(linfiles.values()), columns=plane_columns, limit=0.01, backup=True)
main(files=list(linfiles.values()), columns=plane_columns, limit=[0.01], backup=True)
_assert_nlinfiles(tmp_path, 2)

filtered = {p: tfs.read(f) for p, f in linfiles.items()}
