Skip to content

Commit

Permalink
bugfix: uniq needs LC_ALL=C too
Browse files Browse the repository at this point in the history
  • Loading branch information
jcsahnwaldt committed Sep 24, 2013
1 parent 7565029 commit 4afdfeb
Showing 1 changed file with 5 additions and 6 deletions.
11 changes: 5 additions & 6 deletions scripts/src/main/bash/oneliners.txt
Original file line number Diff line number Diff line change
Expand Up @@ -115,13 +115,12 @@ wget --background --spider --recursive --level 1 --domains=downloads.dbpedia.org
# - may fail for other reasons, depending on file content, shell settings, etc.
# (IFS= and the quotes around $line keep each line's whitespace intact and prevent glob expansion of its content;
#  echo -e then turns the \xHH sequences produced by sed into the corresponding bytes)
sed -r 's/%([89ABCDEF])/\\x\1/g' abc.nt | while IFS= read -r line ; do echo -e "$line" ; done > abc.ttl

# collect the paths of all page-links dump files (one per line) into page-links.txt,
# dropping every path that contains 'en-uris' or 'unredirected'
ls -1 */*/*-page-links*.{nt,ttl}.gz | grep -Ev 'en-uris|unredirected' >page-links.txt

# generate in-link counts
# (include unredirected, may be interesting)
# write the list of page-links dumps (without en-uris) to ../page-links-in.txt, then, in the background,
# count for each dump how often every object (3rd field) occurs and write the counts as triples to the
# matching *page-in-link-counts* file; progress and errors go to ../page-in-link-counts.log
# (uniq runs under LC_ALL=C just like the first sort, so both compare lines independently of the ambient locale;
#  $i is quoted everywhere so unusual file names cannot be split or glob-expanded)
ls -1 */*/*-page-links*.{nt,ttl}.gz | grep -v -E 'en-uris' > ../page-links-in.txt
while IFS= read -r i ; do echo "$i"; gzip -d < "$i" | grep -v '^#' | cut -d ' ' -f 3 | env LC_ALL=C sort | env LC_ALL=C uniq -c | sort -k 1,1 -n -s -r | awk '{print $2" <http://dbpedia.org/ontology/wikiPageInLinkCount> \""$1"\"^^<http://www.w3.org/2001/XMLSchema#integer> ." }' | gzip -c > "${i/page-links/page-in-link-counts}" ; done < ../page-links-in.txt &> ../page-in-link-counts.log &
# for every file listed in page-links.txt: count how often each object (3rd field) occurs, order by descending count,
# and write the counts as wikiPageInLinkCount triples to the matching *page-in-link-counts* file
# (runs in a background subshell with LC_ALL=C exported, so sort and uniq both see it; output goes to page-in-link-counts.log;
#  $i is quoted everywhere so unusual file names cannot be split or glob-expanded)
( export LC_ALL=C ; while IFS= read -r i ; do echo "$i"; gzip -d < "$i" | grep -v '^#' | cut -d ' ' -f 3 | sort | uniq -c | sort -k 1,1 -n -s -r | awk '{print $2" <http://dbpedia.org/ontology/wikiPageInLinkCount> \""$1"\"^^<http://www.w3.org/2001/XMLSchema#integer> ." }' | gzip -c > "${i/page-links/page-in-link-counts}" ; done < page-links.txt &> page-in-link-counts.log & )

# generate out-link counts
# (exclude unredirected, yields identical results)
# write the list of page-links dumps (without en-uris and unredirected) to ../page-links-out.txt, then, in the background,
# count for each dump how often every subject (1st field) occurs and write the counts as triples to the
# matching *page-out-link-counts* file; progress and errors go to ../page-out-link-counts.log
# (uniq runs under LC_ALL=C just like the first sort, so both compare lines independently of the ambient locale;
#  $i is quoted everywhere so unusual file names cannot be split or glob-expanded)
ls -1 */*/*-page-links*.{nt,ttl}.gz | grep -v -E 'en-uris|unredirected' > ../page-links-out.txt
# (the first sort is not really necessary - our triples are already grouped by subject, and grouping is enough for uniq - but it's nicer that the in-link and out-link counts are sorted similarly)
while IFS= read -r i ; do echo "$i"; gzip -d < "$i" | grep -v '^#' | cut -d ' ' -f 1 | env LC_ALL=C sort | env LC_ALL=C uniq -c | sort -k 1,1 -n -s -r | awk '{print $2" <http://dbpedia.org/ontology/wikiPageOutLinkCount> \""$1"\"^^<http://www.w3.org/2001/XMLSchema#integer> ." }' | gzip -c > "${i/page-links/page-out-link-counts}" ; done < ../page-links-out.txt &> ../page-out-link-counts.log &
# for every file listed in page-links.txt: count how often each subject (1st field) occurs, order by descending count,
# and write the counts as wikiPageOutLinkCount triples to the matching *page-out-link-counts* file
# (runs in a background subshell with LC_ALL=C exported, so sort and uniq both see it; output goes to page-out-link-counts.log;
#  $i is quoted everywhere so unusual file names cannot be split or glob-expanded)
( export LC_ALL=C ; while IFS= read -r i ; do echo "$i"; gzip -d < "$i" | grep -v '^#' | cut -d ' ' -f 1 | sort | uniq -c | sort -k 1,1 -n -s -r | awk '{print $2" <http://dbpedia.org/ontology/wikiPageOutLinkCount> \""$1"\"^^<http://www.w3.org/2001/XMLSchema#integer> ." }' | gzip -c > "${i/page-links/page-out-link-counts}" ; done < page-links.txt &> page-out-link-counts.log & )

0 comments on commit 4afdfeb

Please sign in to comment.