From d8872ad51df5c4dc26d7623a36c86557e5e4ece4 Mon Sep 17 00:00:00 2001
From: Javier <javier.millanacosta@maastrichtuniversity.nl>
Date: Mon, 22 Jul 2024 14:17:47 +0200
Subject: [PATCH] Joined jobs

---
 .github/workflows/wikidata.yml | 326 ++++++++++++++++-----------------
 1 file changed, 155 insertions(+), 171 deletions(-)

diff --git a/.github/workflows/wikidata.yml b/.github/workflows/wikidata.yml
index 9a8d74a..2693719 100644
--- a/.github/workflows/wikidata.yml
+++ b/.github/workflows/wikidata.yml
@@ -124,174 +124,158 @@ jobs:
            with: 
              filename: issue.md 
 
-  test_data:
-    runs-on: ubuntu-22.04
-    needs: query_wikidata
-    if: needs.query_wikidata.outputs.TIMEOUT == 'false'
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Trigger docker release
-        uses: peter-evans/repository-dispatch@v3
-        with:
-          token: ${{ secrets.PING }}
-          repository: sec2pri/omicsFixID
-          event-type: update-event
-          client-payload: '{"ref": "${{ github.ref }}", "sha": "${{ github.sha }}", "datasource": "Wikidata"}'
-
-
-      - name: RegEx and Diff test
-        if:
-          ${{ env.FAILED == 'false' }}
-        run: |
-          chmod +x datasources/Wikidata/config
-          . datasources/Wikidata/config .
-          file_to_diff="geneProtein_secID2priID.tsv"
-          old="datasources/Wikidata/data/$file_to_diff"
-          new="datasources/Wikidata/data/results/$file_to_diff"
-          # remove headers
-          sed -i '1d' "$new"
-          sed -i '1d' "$old"
-          # qc integrity of IDs
-          wget -nc https://raw.githubusercontent.com/bridgedb/datasources/main/datasources.tsv
-          Wikidata_ID=$(awk -F '\t' '$1 == "Wikidata" {print $10}' datasources.tsv)
-          # Split the file into two separate files for each column
-          awk -F '\t' '{print $1}' $new > column1.txt
-          awk -F '\t' '{print $2}' $new > column2.txt
-
-          # Use grep to check if any line in the primary column doesn't match the pattern
-          if grep -nqvE "$Wikidata_ID" "column1.txt"; then
-            echo "All lines in the primary column match the pattern."
-          else
-            echo "Error: At least one line in the primary column does not match pattern."
-            grep -nvE "^$Wikidata_ID$" "column1.txt"
-            echo "FAILED=true" >> $GITHUB_ENV
-            exit 1
-          fi
-
-          # Use grep to check if any line in the secondary column doesn't match the pattern
-          if grep -nqvE "$Wikidata_ID" "column1.txt"; then
-            echo "All lines in the secondary column match the pattern."
-            
-          else
-            echo "Error: At least one line in the secondary column does not match pattern."
-            grep -nqvE "$Wikidata_ID" "column2.txt"
-            echo "FAILED=true" >> $GITHUB_ENV
-            exit 1
-          fi
-          # sort them
-          cat "$old" | sort | tr -d "\r" > ids_old.txt
-          cat "$new" | sort | tr -d "\r" > ids_new.txt
-          echo "Performing diff between the sorted lists of IDs"
-          # Perform a diff between the sorted lists of IDs
-          output_file=diff.txt
-          diff -u ids_old.txt ids_new.txt > $output_file || true
-          # retrieve new lines
-          added=$(grep '^+Wikidata' "$output_file" | sed 's/-//g') || true
-          # retrieve removed lines
-          removed=$(grep '^-' "$output_file" | sed 's/-//g') || true
-          # Create temporary files
-          tmp_added=$(mktemp)
-          tmp_withdrawn=$(mktemp)
-          tmp_removed=$(mktemp)
-          # Write the content of the added variable to the temporary file
-          echo "$added" > "$tmp_added"
-          # Retrieve matches and store them in another temporary file
-          grep 'Entry Withdrawn' "$tmp_added" > "$tmp_withdrawn" || true
-          # Append matches to the removed variable
-          if [ -n "$removed" ]; then
-            echo -e "$removed\n$(cat $tmp_withdrawn)" > "$tmp_removed"
-          else
-            cat "$tmp_withdrawn" > "$tmp_removed"
-          fi
-          # Remove matching lines from the added variable
-          sed '/Entry Withdrawn/d' "$tmp_added" > "$tmp_added.tmp" && mv "$tmp_added.tmp" "$tmp_added"
-          # Read the updated content back into the variables
-          added=$(cat "$tmp_added")
-          removed=$(cat "$tmp_removed")                 
-          # Clean up temporary files
-          rm "$tmp_added" "$tmp_withdrawn" "$tmp_removed"
-          added_filtered=$(comm -23 <(sort <<< "$added") <(sort <<< "$removed"))
-          removed_filtered=$(comm -23 <(sort <<< "$removed") <(sort <<< "$added"))
-          added=$added_filtered
-          removed=$removed_filtered
-          # count them
-          count_removed=$(printf "$removed" | wc -l) || true
-          count_added=$(printf "$added" | wc -l) || true
-          # make sure we are not counting empty lines
-          if [ -z "$removed" ]; then
-           count_removed=0
-           removed="None"
-          fi
-          if [ -z "$added" ]; then
-           count_added=0
-           added="None"
-          fi
-          echo ________________________________________________
-          echo "                 removed pairs                    "
-          echo ________________________________________________
-          echo "$removed"
-          echo ________________________________________________
-          echo "                 added pairs                    "
-          echo ________________________________________________
-          echo "$added"
-          echo _________________________________________________
-          echo "What's changed:"
-          echo "- Added id pairs: $count_added"
-          echo "- Removed id pairs: $count_removed"
-          # Store to env to use in issue
-          echo "ADDED=$count_added" >> $GITHUB_ENV
-          echo "REMOVED=$count_removed" >> $GITHUB_ENV
-          count=$(expr $count_added + $count_removed) || true
-          echo "COUNT=$count" >> $GITHUB_ENV
-          total_old=$(cat "$old" | wc -l) || true 
-          change=$((100 * count / total_old))
-          echo "CHANGE=$change" >> $GITHUB_ENV 
-
-      - name: 'Upload processed data as artifacts'
-        id: artifact-upload
-        uses: actions/upload-artifact@v4
-        with:
-          name: Wikidata_processed
-          path: datasources/Wikidata/data/results/*
-
-      - uses: JasonEtco/create-an-issue@v2
-        if: ${{ env.COUNT != 0 }}
-        name: Post issue about update availability
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          SOURCE: Wikidata
-        with:
-          filename: .github/ISSUE_TEMPLATE/ISSUE_UPDATE.md   
-          update_existing: true
-       
-      - uses: JasonEtco/create-an-issue@v2
-        name: Post issue about failing test
-        if: ${{ env.FAILED == 'true' }}
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          SOURCE: Wikidata
-        with:
-          filename: .github/ISSUE_TEMPLATE/ISSUE_FAIL.md
-          update_existing: true
-
-      - name: Trigger docker release
-        uses: peter-evans/repository-dispatch@v3
-        if: ${{ env.COUNT != 0 }}
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          RELEASE_NUMBER: ${{ needs.query_wikidata.outputs.RELEASE_NUMBER }}
-          RELEASE_DATE: ${{ needs.query_wikidata.outputs.DATE_NEW }}
-        with:
-          token: ${{ secrets.PING }}
-          repository: sec2pri/omicsFixID
-          event-type: update-event
-          client-payload: >
-            {
-              "ref": "${{ github.ref }}",
-              "sha": "${{ github.sha }}",
-              "datasource": "Wikidata",
-              "version": "${{ env.RELEASE_NUMBER }}",
-              "date": "${{ env.RELEASE_DATE }}",
-              "processed_data": "https://github.com/sec2pri/mapping_preprocessing/actions/runs/${{ github.run_id }}/artifacts/${{ steps.artifact-upload.outputs.artifact-id }}"
-            }
\ No newline at end of file
+         - name: RegEx and Diff test
+           if: ${{ env.FAILED == 'false' }}
+           run: |
+             # 1) check both ID columns against the Wikidata pattern from datasources.tsv
+             # 2) diff the new pairs against the previous data file
+             chmod +x datasources/Wikidata/config
+             . datasources/Wikidata/config .
+             file_to_diff="geneProtein_secID2priID.tsv"
+             old="datasources/Wikidata/data/$file_to_diff"
+             new="datasources/Wikidata/data/results/$file_to_diff"
+             # remove headers
+             sed -i '1d' "$new"
+             sed -i '1d' "$old"
+             # qc integrity of IDs (-nc: do not re-download an existing datasources.tsv)
+             wget -nc https://raw.githubusercontent.com/bridgedb/datasources/main/datasources.tsv
+             Wikidata_ID=$(awk -F '\t' '$1 == "Wikidata" {print $10}' datasources.tsv)
+             # An empty pattern would match every line and silently disable the check
+             if [ -z "$Wikidata_ID" ]; then
+               echo "Error: no ID pattern found for Wikidata in datasources.tsv."
+               echo "FAILED=true" >> $GITHUB_ENV
+               exit 1
+             fi
+             # Split the file into two separate files for each column (CRs stripped)
+             tr -d '\r' < "$new" | awk -F '\t' '{print $1}' > column1.txt
+             tr -d '\r' < "$new" | awk -F '\t' '{print $2}' > column2.txt
+
+             # grep -v selects lines that do NOT match, so a zero exit status means a bad ID.
+             # -P: the pattern may use Perl-style classes such as \d; -x: match the whole line.
+             if grep -qvxP -- "$Wikidata_ID" "column1.txt"; then
+               echo "Error: At least one line in the primary column does not match pattern."
+               grep -nvxP -- "$Wikidata_ID" "column1.txt"
+               echo "FAILED=true" >> $GITHUB_ENV
+               exit 1
+             else
+               echo "All lines in the primary column match the pattern."
+             fi
+
+             # Same check for the secondary column
+             if grep -qvxP -- "$Wikidata_ID" "column2.txt"; then
+               echo "Error: At least one line in the secondary column does not match pattern."
+               grep -nvxP -- "$Wikidata_ID" "column2.txt"
+               echo "FAILED=true" >> $GITHUB_ENV
+               exit 1
+             else
+               echo "All lines in the secondary column match the pattern."
+             fi
+
+             # sort them (one fixed locale so sort and comm agree on the ordering)
+             export LC_ALL=C
+             tr -d '\r' < "$old" | sort > ids_old.txt
+             tr -d '\r' < "$new" | sort > ids_new.txt
+             echo "Performing diff between the sorted lists of IDs"
+             # Keep the unified diff on disk for inspection
+             output_file=diff.txt
+             diff -u ids_old.txt ids_new.txt > $output_file || true
+             # retrieve new lines (only in the new list)
+             added=$(comm -13 ids_old.txt ids_new.txt)
+             # retrieve removed lines (only in the old list)
+             removed=$(comm -23 ids_old.txt ids_new.txt)
+             # New lines flagged 'Entry Withdrawn' are reported as removals,
+             # not as additions
+             withdrawn=$(grep 'Entry Withdrawn' <<< "$added") || true
+             if [ -n "$withdrawn" ]; then
+               removed=$(printf '%s\n' "$removed" "$withdrawn" | sed '/^$/d')
+               added=$(grep -v 'Entry Withdrawn' <<< "$added") || true
+             fi
+
+             # count them (non-empty lines only)
+             count_removed=$(grep -c . <<< "$removed") || true
+             count_added=$(grep -c . <<< "$added") || true
+             # placeholder text for empty lists
+             if [ -z "$removed" ]; then
+               count_removed=0
+               removed="None"
+             fi
+             if [ -z "$added" ]; then
+               count_added=0
+               added="None"
+             fi
+
+             echo ________________________________________________
+             echo "                 removed pairs                    "
+             echo ________________________________________________
+             echo "$removed"
+             echo ________________________________________________
+             echo "                 added pairs                    "
+             echo ________________________________________________
+             echo "$added"
+             echo _________________________________________________
+             echo "What's changed:"
+             echo "- Added id pairs: $count_added"
+             echo "- Removed id pairs: $count_removed"
+
+             # Store to env to use in issue
+             echo "ADDED=$count_added" >> $GITHUB_ENV
+             echo "REMOVED=$count_removed" >> $GITHUB_ENV
+             # COUNT is read by the later issue and release steps (env.COUNT)
+             count=$((count_added + count_removed))
+             echo "COUNT=$count" >> $GITHUB_ENV
+             total_old=$(wc -l < "$old")
+             # An empty previous file would otherwise cause a division by zero
+             if [ "$total_old" -gt 0 ]; then
+               change=$((100 * count / total_old))
+             else
+               change=0
+             fi
+             echo "CHANGE=$change" >> $GITHUB_ENV
+
+         # Publish the result files; a later step reads this step's artifact-id output.
+         - id: artifact-upload
+           name: Upload processed data as artifacts
+           uses: actions/upload-artifact@v4
+           with:
+             path: datasources/Wikidata/data/results/*
+             name: Wikidata_processed
+         # Post the update issue only when COUNT (exported by the diff step) is non-zero.
+         - name: Post issue about update availability
+           if: ${{ env.COUNT != 0 }}
+           uses: JasonEtco/create-an-issue@v2
+           with:
+             update_existing: true
+             filename: .github/ISSUE_TEMPLATE/ISSUE_UPDATE.md
+           env:
+             SOURCE: Wikidata
+             GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+         # !cancelled() replaces the implicit success() check, so this runs after a failed step.
+         - uses: JasonEtco/create-an-issue@v2
+           name: Post issue about failing test
+           if: ${{ !cancelled() && env.FAILED == 'true' }}
+           env:
+             GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+             SOURCE: Wikidata
+           with:
+             filename: .github/ISSUE_TEMPLATE/ISSUE_FAIL.md
+             update_existing: true
+         - name: Trigger docker release
+           uses: peter-evans/repository-dispatch@v3
+           if: ${{ env.COUNT != 0 }}
+           # NOTE(review): `needs.*` only resolves for jobs this job depends on; confirm
+           # both outputs below are still reachable if these steps now share one job.
+           env:
+             GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+             RELEASE_NUMBER: ${{ needs.query_wikidata.outputs.RELEASE_NUMBER }}
+             RELEASE_DATE: ${{ needs.query_wikidata.outputs.DATE_NEW }}
+           with:
+             token: ${{ secrets.PING }}
+             repository: sec2pri/omicsFixID
+             event-type: update-event
+             client-payload: >
+               {
+                 "ref": "${{ github.ref }}", "sha": "${{ github.sha }}",
+                 "datasource": "Wikidata",
+                 "version": "${{ env.RELEASE_NUMBER }}", "date": "${{ env.RELEASE_DATE }}",
+                 "processed_data": "https://github.com/sec2pri/mapping_preprocessing/actions/runs/${{ github.run_id }}/artifacts/${{ steps.artifact-upload.outputs.artifact-id }}"
+               }
\ No newline at end of file