# .github/workflows/chebi.yml

name: Check and test ChEBI updates

on:
  # Allow manual runs
  workflow_dispatch:
  # Run on pull requests that touch this workflow file, to test that it still works
  pull_request:
    paths:
      - ".github/workflows/chebi.yml"
  # Run the workflow at 00:00 on the 1st and 15th day of each month
  schedule:
    - cron: "0 0 1,15 * *"

# Token permissions granted to every job in this workflow
permissions:
  contents: write
  pages: write
  id-token: write
  issues: write


jobs:
  check_new_release:
    name: Check latest release date
    runs-on: ubuntu-latest
    # Values handed to downstream jobs; all of them are written by the check_download step
    outputs:
      RELEASE_NUMBER: ${{ steps.check_download.outputs.RELEASE_NUMBER }}
      CURRENT_RELEASE_NUMBER: ${{ steps.check_download.outputs.CURRENT_RELEASE_NUMBER }}
      NEW_RELEASE: ${{ steps.check_download.outputs.NEW_RELEASE }}
      DATE_NEW: ${{ steps.check_download.outputs.DATE_NEW }}
      DATE_OLD: ${{ steps.check_download.outputs.DATE_OLD }}
    steps:
      # The repository is needed for the ChEBI config file sourced below
      - name: Checkout
        uses: actions/checkout@v4
      # Compare the date of the newest archive entry with the date recorded in the config
      - name: Check for new ChEBI release
        id: check_download
        run: |
          # Source the config; this script reads $release and $date from it
          . datasources/chebi/config .
          current_release=$release
          date_old=$date

          echo 'Accessing the ChEBI archive'
          wget https://ftp.ebi.ac.uk/pub/databases/chebi/archive/ -O chebi_index.html

          # The entry inspected is the 4th line from the end of the index page
          latest_row=$(tail -4 chebi_index.html | head -1)
          # Date of that entry, truncated to the day
          date_new=$(grep -oP '<td align="right">\K[0-9-]+\s[0-9:]+(?=\s+</td>)' <<< "$latest_row" | awk '{print $1}')
          # Three-digit release number taken from the entry's "rel###" link
          release=$(grep -oP '(?<=a href="rel)\d\d\d' <<< "$latest_row")

          # Publish the step outputs
          # https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#setting-an-output-parameter
          {
            echo "CURRENT_RELEASE_NUMBER=$current_release"
            echo "RELEASE_NUMBER=$release"
            echo "DATE_OLD=$date_old"
            echo "DATE_NEW=$date_new"
          } >> "$GITHUB_OUTPUT"

          # Compare both dates as epoch seconds; NEW_RELEASE is only set when the archive is newer
          new_epoch=$(date -d "$date_new" +%s)
          old_epoch=$(date -d "$date_old" +%s)
          if [[ "$new_epoch" -gt "$old_epoch" ]]; then
            echo 'New release available', "$release"
            echo "NEW_RELEASE=true" >> "$GITHUB_OUTPUT"
          else
            echo 'No new release available'
          fi
          echo "Date of latest release: $date_new", "Date of release of the current version: $date_old"

          # Clean up the downloaded index page
          rm chebi_index.html
  
  test_sdf_processing:
    name: Test release
    runs-on: ubuntu-latest
    needs: check_new_release
    # NEW_RELEASE is only written (as "true") when a newer release was found,
    # so the job is skipped when the output is empty
    if: ${{ needs.check_new_release.outputs.NEW_RELEASE }}
    # Expose the outputs of the release check to every step of this job
    env:
      RELEASE_NUMBER: ${{ needs.check_new_release.outputs.RELEASE_NUMBER }}
      CURRENT_RELEASE_NUMBER: ${{ needs.check_new_release.outputs.CURRENT_RELEASE_NUMBER }}
      DATE_NEW: ${{ needs.check_new_release.outputs.DATE_NEW }}
      DATE_OLD: ${{ needs.check_new_release.outputs.DATE_OLD }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      # Download and unpack the SDF of the new ChEBI release into the workspace root
      - name: Download SDF for new release
        run: |
          # Store outputs from previous job in environment variables.
          # The variable name must be the literal DATE_NEW, not its expanded value.
          echo "DATE_NEW=$DATE_NEW" >> "$GITHUB_ENV"
          echo "RELEASE_NUMBER=$RELEASE_NUMBER" >> "$GITHUB_ENV"
          echo "CURRENT_RELEASE_NUMBER=$CURRENT_RELEASE_NUMBER" >> "$GITHUB_ENV"
          echo "$RELEASE_NUMBER"
          url_release="https://ftp.ebi.ac.uk/pub/databases/chebi/archive/rel$RELEASE_NUMBER/SDF/"
          echo "URL_RELEASE=$url_release" >> "$GITHUB_ENV"
          ## Download ChEBI SDF file
          wget "${url_release}ChEBI_complete_3star.sdf.gz"
          ## Unzip gz file:
          gunzip ChEBI_complete_3star.sdf.gz #TODO replace by config var
          ## List the workspace so the unpacked file shows up in the log
          ls
          # Set up vars from config file
          # NOTE(review): variables sourced here are not used later in this step and
          # do not carry over to other steps - confirm whether this is still needed
          chmod +x datasources/chebi/config
          . datasources/chebi/config .
          ## Create temp. folder to store the data in
          mkdir -p mapping_preprocessing/datasources/chebi/data
      # step 4: set up Java, which is needed to run the .jar for ChEBI preprocessing
      - name: Set up Java
        uses: actions/setup-java@v4
        with:
          distribution: temurin
          java-version: "11"
      # Download current version from Zenodo
      #- name: Download current mapping file from Zenodo
      #  id: download_current_zenodo
      #  env:
      #    zenodo_token: ${{ secrets.ZENODO }}
      #  run: |
      #    # Set up vars from config file
      #    chmod +x datasources/chebi/config
      #    . datasources/chebi/config .
      #    echo "file name: $to_check_from_zenodo"
      #    # Request Zenodo API to download the file
      #    curl -H "Authorization: Bearer $zenodo_token" -LJO https://zenodo.org/api/record/$zenodo_file_id/$to_check_from_zenodo
      # Run the Java preprocessing on the downloaded SDF and record the outcome in FAILED
      - name: Test SDF processing
        id: sdf_process
        run: |
          inputFile="ChEBI_complete_3star.sdf"
          # NOTE(review): nothing visible in this workflow uses the "new" directory - confirm it is needed
          mkdir new
          outputDir="datasources/chebi/recentData/"
          # Run the Java program as the `if` condition itself: the default shell runs with
          # `-e`, so a failing command followed by a separate `$?` test would abort the
          # script before FAILED=true could be recorded.
          if java -cp java/target/mapping_prerocessing-0.0.1-jar-with-dependencies.jar org.sec2pri.chebi_sdf "$inputFile" "$outputDir"; then
            # Java program succeeded
            echo "Successful preprocessing of ChEBI data."
            echo "FAILED=false" >> "$GITHUB_ENV"
          else
            # Java program failed; later steps read FAILED to decide what to run
            echo "Failed preprocessing of ChEBI data."
            echo "FAILED=true" >> "$GITHUB_ENV"
          fi
      # Validate the IDs of the freshly generated mapping file against the ChEBI ID
      # pattern, then diff it against the stored version and export the change counts
      # (ADDED, REMOVED, COUNT, CHANGE) for the issue template.
      - name: RegEx and Diff test
        if: ${{ env.FAILED == 'false' }}
        run: |
          # Load the config; $to_check_from_zenodo names the mapping file to compare
          chmod +x datasources/chebi/config
          . datasources/chebi/config .
          old="datasources/chebi/data/$to_check_from_zenodo"
          new="datasources/chebi/recentData/$to_check_from_zenodo"
          # remove headers
          sed -i '1d' "$new"
          sed -i '1d' "$old"

          # qc integrity of IDs
          wget -nc https://raw.githubusercontent.com/bridgedb/datasources/main/datasources.tsv
          # Column 10 of the ChEBI row is used as the ID pattern
          CHEBI_ID=$(awk -F '\t' '$1 == "ChEBI" {print $10}' datasources.tsv)
          if [ -z "$CHEBI_ID" ]; then
            echo "Error: no ChEBI ID pattern found in datasources.tsv."
            echo "FAILED=true" >> "$GITHUB_ENV"
            exit 1
          fi
          # Split the file into two separate files for each column; carriage returns
          # are dropped so a CRLF line ending cannot break the match on the last column
          awk -F '\t' '{print $1}' "$new" | tr -d '\r' > column1.txt
          awk -F '\t' '{print $2}' "$new" | tr -d '\r' > column2.txt

          # check_column FILE LABEL
          # Fails the step when any line of FILE does not match the ID pattern as a whole.
          # -v selects the NON-matching lines, so grep exit status 0 means "mismatch found"
          # and 1 means "every line matched". -P is used because the pattern is presumably
          # written with Perl classes such as \d, which -E does not understand (TODO confirm).
          # NOTE(review): rows with an empty value in the column are reported as mismatches.
          check_column() {
            local status=0
            grep -nvxP -- "$CHEBI_ID" "$1" > mismatches.txt || status=$?
            if [ "$status" -eq 1 ]; then
              echo "All lines in the $2 column match the pattern."
              return 0
            fi
            if [ "$status" -eq 0 ]; then
              echo "Error: At least one line in the $2 column does not match pattern."
              cat mismatches.txt
            else
              echo "Error: the ID pattern could not be applied to the $2 column."
            fi
            echo "FAILED=true" >> "$GITHUB_ENV"
            exit 1
          }
          check_column column1.txt primary
          check_column column2.txt secondary

          # sort them
          sort "$old" | tr -d "\r" > ids_old.txt
          sort "$new" | tr -d "\r" > ids_new.txt
          echo "Performing diff between the sorted lists of IDs"
          # Perform a diff between the sorted lists of IDs (diff exits 1 when files differ)
          output_file=diff.txt
          diff -u ids_old.txt ids_new.txt > "$output_file" || true
          # retrieve new lines; only the leading "+" diff marker is stripped
          added=$(grep '^+CHEBI' "$output_file" | sed 's/^+//') || true
          # retrieve removed lines; matching on CHEBI keeps the "---" diff header out
          removed=$(grep '^-CHEBI' "$output_file" | sed 's/^-//') || true

          # Added lines flagged as withdrawn are reported as removed instead of added
          withdrawn=$(grep 'Entry Withdrawn' <<< "$added") || true
          added=$(grep -v 'Entry Withdrawn' <<< "$added") || true
          removed=$(printf '%s\n%s\n' "$removed" "$withdrawn" | sed '/^$/d')

          # Drop lines that show up in both lists
          added_filtered=$(comm -23 <(sort <<< "$added") <(sort <<< "$removed"))
          removed_filtered=$(comm -23 <(sort <<< "$removed") <(sort <<< "$added"))
          added=$added_filtered
          removed=$removed_filtered

          # count them; an empty list is 0 lines, not 1
          count_lines() {
            if [ -z "$1" ]; then
              echo 0
            else
              wc -l <<< "$1"
            fi
          }
          count_removed=$(count_lines "$removed")
          count_added=$(count_lines "$added")
          if [ -z "$removed" ]; then
            removed="None"
          fi
          if [ -z "$added" ]; then
            added="None"
          fi

          echo ________________________________________________
          echo "                 removed pairs                    "
          echo ________________________________________________
          echo "$removed"
          echo ________________________________________________
          echo "                 added pairs                    "
          echo ________________________________________________
          echo "$added"
          echo _________________________________________________
          echo "What's changed:"
          echo "- Added id pairs: $count_added"
          echo "- Removed id pairs: $count_removed"

          # Store to env to use in issue
          echo "ADDED=$count_added" >> "$GITHUB_ENV"
          echo "REMOVED=$count_removed" >> "$GITHUB_ENV"
          count=$((count_added + count_removed))
          echo "COUNT=$count" >> "$GITHUB_ENV"
          # Percentage of changed pairs relative to the old file (integer division)
          total_old=$(wc -l < "$old")
          if [ "$total_old" -gt 0 ]; then
            change=$((100 * count / total_old))
          elif [ "$count" -gt 0 ]; then
            # Old file is empty: avoid dividing by zero and report everything as changed
            change=100
          else
            change=0
          fi
          echo "CHANGE=$change" >> "$GITHUB_ENV"

      # Attach everything under recentData to the workflow run
      - name: Upload processed data as artifacts
        uses: actions/upload-artifact@v4
        with:
          path: datasources/chebi/recentData/*
          name: chebi_processed

      # COUNT is exported by the "RegEx and Diff test" step; post only when it is non-zero
      - name: Post issue about update availability
        if: ${{ env.COUNT != 0 }}
        uses: JasonEtco/create-an-issue@v2
        env:
          SOURCE: ChEBI
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          update_existing: true
          filename: .github/ISSUE_TEMPLATE/ISSUE_UPDATE.md
       
      # Report a failing test run. A condition without a status function gets an implicit
      # success(), which would skip this step whenever an earlier step failed, i.e. in
      # exactly the situation it is meant to report. failure() covers steps that exited
      # non-zero, env.FAILED covers steps that recorded a failure but exited cleanly,
      # and !cancelled() keeps the step from running on a cancelled run.
      - name: Post issue about failing test
        if: ${{ !cancelled() && (failure() || env.FAILED == 'true') }}
        uses: JasonEtco/create-an-issue@v2
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          SOURCE: chebi
        with:
          filename: .github/ISSUE_TEMPLATE/ISSUE_FAIL.md
          update_existing: true
  # Docker
  # Sends an "update-event" repository dispatch to sec2pri/omicsFixID.
  # NOTE(review): this job has no `needs` or `if`, so it runs on every trigger of the
  # workflow regardless of the release check - confirm that this is intended.
  trigger-docker-update:
    runs-on: "ubuntu-22.04"
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Trigger docker release
        uses: peter-evans/repository-dispatch@v3
        with:
          repository: sec2pri/omicsFixID
          event-type: update-event
          token: ${{ secrets.PING }}
          # JSON payload delivered with the event
          client-payload: >-
            {"ref": "${{ github.ref }}", "sha": "${{ github.sha }}", "datasource": "ChEBI"}