Patch summary (free text before the first diff header; skipped by patch tools):
  * Build one shared list of page-links files, page-links.txt, excluding both
    'en-uris' and 'unredirected', instead of separate ../page-links-in.txt and
    ../page-links-out.txt lists (the old in-link list kept 'unredirected').
  * Run each counting loop in a subshell that exports LC_ALL=C, replacing the
    per-command 'env LC_ALL=C sort'.
  * Write the file list and the logs to the current directory instead of '..'.
NOTE(review): the awk print strings ("$2" \""$1"\"^^ .") look like they lost
  <...> IRI tokens (predicate / datatype) somewhere upstream - confirm against
  the repository before applying.

diff --git a/scripts/src/main/bash/oneliners.txt b/scripts/src/main/bash/oneliners.txt
index 862d5152a5..1ba81a87db 100644
--- a/scripts/src/main/bash/oneliners.txt
+++ b/scripts/src/main/bash/oneliners.txt
@@ -115,13 +115,12 @@ wget --background --spider --recursive --level 1 --domains=downloads.dbpedia.org
 # - may fail for other reasons, depending on file content, shell settings, etc.
 sed -r 's/%([89ABCDEF])/\\x\1/g' abc.nt | while read -r line ; do echo -e $line ; done > abc.ttl
 
+# list all page-links files
+ls -1 */*/*-page-links*.{nt,ttl}.gz | grep -v -E 'en-uris|unredirected' > page-links.txt
+
 # generate in-link counts
-# (include unredirected, may be interesting)
-ls -1 */*/*-page-links*.{nt,ttl}.gz | grep -v -E 'en-uris' > ../page-links-in.txt
-cat ../page-links-in.txt | while read i ; do echo $i; gzip -d < $i | grep -v '^#' | cut -d ' ' -f 3 | env LC_ALL=C sort | uniq -c | sort -k 1,1 -n -s -r | awk '{print $2" \""$1"\"^^ ." }' | gzip -c > ${i/page-links/page-in-link-counts} ; done &> ../page-in-link-counts.log &
+( export LC_ALL=C ; cat page-links.txt | while read i ; do echo $i; gzip -d < $i | grep -v '^#' | cut -d ' ' -f 3 | sort | uniq -c | sort -k 1,1 -n -s -r | awk '{print $2" \""$1"\"^^ ." }' | gzip -c > ${i/page-links/page-in-link-counts} ; done &> page-in-link-counts.log & )
 
 # generate out-link count
-# (exclude unredirected, yields identical results)
-ls -1 */*/*-page-links*.{nt,ttl}.gz | grep -v -E 'en-uris|unredirected' > ../page-links-out.txt
 # (the first sort is not really necessary - our triples are already grouped by subject, and grouping is enough for uniq - but it's nicer that the in-link and out-link counts are sorted similarly)
-cat ../page-links-out.txt | while read i ; do echo $i; gzip -d < $i | grep -v '^#' | cut -d ' ' -f 1 | env LC_ALL=C sort | uniq -c | sort -k 1,1 -n -s -r | awk '{print $2" \""$1"\"^^ ." }' | gzip -c > ${i/page-links/page-out-link-counts} ; done &> ../page-out-link-counts.log &
+( export LC_ALL=C ; cat page-links.txt | while read i ; do echo $i; gzip -d < $i | grep -v '^#' | cut -d ' ' -f 1 | sort | uniq -c | sort -k 1,1 -n -s -r | awk '{print $2" \""$1"\"^^ ." }' | gzip -c > ${i/page-links/page-out-link-counts} ; done &> page-out-link-counts.log & )