def filter_df_empty_rows(df: pd.DataFrame, column: str = NAME_COLUMN):
    """Drop rows whose value in ``column`` is missing.

    Rows whose 'type' is a group marker ('begin group' / 'end group', in
    either the spaced or the underscored spelling) are always kept, even
    when they have no value in ``column``.

    A dataframe that does not contain ``column`` is returned unchanged.

    NOTE A generic df.dropna(how="all") would not catch accidental spaces etc.
    """
    # Nothing to filter on: hand the frame back untouched.
    if column not in df.columns:
        return df

    has_value = df[column].notna()

    # Without a 'type' column there are no group rows to protect.
    if "type" not in df.columns:
        return df[has_value]

    group_markers = ["begin group", "end group", "begin_group", "end_group"]
    is_group_row = df["type"].isin(group_markers)
    return df[has_value | is_group_row]
def handle_translations(
    mandatory_df: pd.DataFrame,
    user_question_df: pd.DataFrame,
    digitisation_df: pd.DataFrame,
    fields: list[str],
    default_language: str = "English(en)",
):
    """Align translatable columns of our own frames with the user's form.

    For every field in ``fields`` (e.g. 'label', 'hint', 'required_message'):

    - If the user form has only the base column (e.g. 'label') and no
      '<field>::<language>' columns, the mandatory and digitisation frames
      are collapsed to match: their '<field>::<default_language>' column
      (when present) becomes the base column, and all of their
      '<field>::*' columns are dropped.
    - Otherwise (the user form is already translated, or does not have the
      field at all) the mandatory and digitisation frames are left as is.

    Args:
        mandatory_df: Mandatory survey fields.
        user_question_df: Survey fields supplied by the user.
        digitisation_df: Digitisation survey fields.
        fields: Base names of the translatable columns to process.
        default_language: Language suffix used as the fallback translation,
            i.e. the part after '::' in the column name.

    Returns:
        Tuple of (mandatory_df, user_question_df, digitisation_df). The first
        and third are new dataframes; ``user_question_df`` is returned
        unmodified. None of the input frames is mutated.
    """
    # Work on copies so the caller's frames (which may be filtered slices of
    # a larger frame) are never modified in place.
    mandatory_df = mandatory_df.copy()
    digitisation_df = digitisation_df.copy()

    for field in fields:
        prefix = f"{field}::"
        user_has_translations = any(str(col).startswith(prefix) for col in user_question_df.columns)

        # Only collapse when the user form is untranslated for this field.
        if field not in user_question_df.columns or user_has_translations:
            continue

        mandatory_df = _collapse_translation_columns(mandatory_df, field, default_language)
        digitisation_df = _collapse_translation_columns(digitisation_df, field, default_language)

    return mandatory_df, user_question_df, digitisation_df


def _collapse_translation_columns(df: pd.DataFrame, field: str, language: str) -> pd.DataFrame:
    """Replace the '<field>::*' columns of ``df`` with a single base ``field`` column."""
    prefix = f"{field}::"
    preferred = f"{prefix}{language}"

    # Prefer the requested translation; otherwise keep any existing base
    # column. No column is created when neither exists.
    if preferred in df.columns:
        df[field] = df[preferred]

    # Drop the translations of *this* field only, whatever the field is.
    translated = [col for col in df.columns if str(col).startswith(prefix)]
    return df.drop(columns=translated)
def check_translation_fields(workbook: Workbook):
    """Report which label columns appear in the 'survey' sheet.

    Every cell of the sheet is scanned, from the first row and across all
    used columns.

    Returns:
        A tuple ``(translation_found, label_field_found)``:
        ``translation_found`` is True if any cell equals 'label::English(en)';
        ``label_field_found`` is True if any cell equals the base 'label'.
    """
    sheet = workbook["survey"]

    # Flatten the sheet into a single list of cell values.
    cell_values = [cell.value for row in sheet.iter_rows(min_row=1, max_col=sheet.max_column) for cell in row]

    translation_found = "label::English(en)" in cell_values
    label_field_found = "label" in cell_values
    return translation_found, label_field_found