From 4da1590a6704e7c5d6b874e1611d2670fae7d8b4 Mon Sep 17 00:00:00 2001 From: Andy Jackson Date: Wed, 22 Nov 2023 16:10:28 +0000 Subject: [PATCH] Update README a bit. --- README.md | 47 ++++++++++------------------------------------- 1 file changed, 10 insertions(+), 37 deletions(-) diff --git a/README.md b/README.md index 1ae0d71..e176988 100644 --- a/README.md +++ b/README.md @@ -8,21 +8,21 @@ Take crawl events and turn them into Apache Parquet so they can be queried using Requires Python >= 3.8 to avoid fastparquet bug https://github.com/dask/fastparquet/issues/825 +At present, there is no MrJob/Map-Reduce version, but that could be added if needed. + ## Example of use -14.5G 100 14.5G 0 0 48.1M 0 0:05:08 0:05:08 --:--:-- 52.2M -[ec2-user@fc crawl-db]$ wc crawl.log.cp00004 - 37357947 635085091 15578444671 crawl.log.cp00004 +During a crawl, we rapidly built up a lot of log activity. Some 37,357,947 log lines, in a 14.5GB file. This file was process like this: + +python -m crawldb.parquet.cli import crawl.log.cp00004-20231116123457 crawl-log-cp00004.parquet +This took a while, about an hour, but created a queryable file only 2.8GB in size. -(.venv) [ec2-user@fc crawl-db]$ head -1 /mnt/data/fc/crawl-db/crawl.log.cp00004 -2023-11-15T17:13:05.828Z -5003 - https://cdn.mos.cms.futurecdn.net/EaNfygyiNnxXmUAXnpcojG-768-80.jpeg LE https://www.wallpaper.com/fashion-beauty/sacai-mercedes-benz-amg-collaboration unknown #070 - - tid:94192:https://www.wallpaper.com/ Q:serverMaxSuccessKb {"scopeDecision":"ACCEPT by rule #2 MatchesRegexDecideRule"} -(.venv) [ec2-user@fc crawl-db]$ tail -1 /mnt/data/fc/crawl-db/crawl.log.cp00004 -2023-11-16T12:36:14.536Z 200 357908 https://www.thegazette.co.uk/sitemap-201709-1.xml.gz II https://www.thegazette.co.uk/sitemap.xml application/xml #137 20231116115454719+406 sha1:YUPZK5EJ7WRFWRRLD3CMGHH7ZQW3NDX7 tid:37575:https://www.thegazette.co.uk/ isSitemap,launchTimestamp:20231115090324,err=java.lang.NullPointerException,dol:50002,ip:18.165.201.84 {"contentSize":358576,"warcFilename":"BL-NPLD-20231116115015552-01164-44~npld-heritrix3-worker-1~8443.warc.gz","warcFileOffset":220898170,"scopeDecision":"ACCEPT by rule #1 WatchedFileSurtPrefixedDecideRule","warcFileRecordLength":460354} +Using the example queries (see `duckdb-query.py` for details), we could query this file very quickly, with even fairly intensive aggregation queries only requiring a second or so to run. - 1098 2023-11-16:13:05:01 python -m crawldb.parquet.cli import /mnt/data/fc/heritrix/output/frequent-npld/20231115123452/logs/crawl.log.cp00004-20231116123457 /mnt/data/fc/crawl-db/npld-cp00004.parquet - -rw-r--r--. 1 ec2-user ec2-user 2.8G Nov 16 15:33 npld-cp00004.parquet +``` +SELECT COUNT(*) FROM 'crawl-log-cp00004.parquet'; ┌──────────────┐ │ count_star() │ @@ -30,38 +30,11 @@ Requires Python >= 3.8 to avoid fastparquet bug https://github.com/dask/fastparq ├──────────────┤ │ 37357947 │ └──────────────┘ - - -┌──────────────────────┬──────────────────────┬──────────────────────┬───────────────┬─────────┬───┬───────────────┬─────────────┬─────────────┬────────────┐ -│ id │ url │ host │ domain │ ip │ … │ warc_filename │ warc_offset │ warc_length │ wire_bytes │ -│ varchar │ varchar │ varchar │ varchar │ varchar │ │ varchar │ int64 │ int64 │ int64 │ -├──────────────────────┼──────────────────────┼──────────────────────┼───────────────┼─────────┼───┼───────────────┼─────────────┼─────────────┼────────────┤ -│ 2023-11-15T17:13:0… │ https://cdn.mos.cm… │ cdn.mos.cms.future… │ futurecdn.net │ NULL │ … │ NULL │ NULL │ NULL │ NULL │ -├──────────────────────┴──────────────────────┴──────────────────────┴───────────────┴─────────┴───┴───────────────┴─────────────┴─────────────┴────────────┤ -│ 1 rows 22 columns (9 shown) │ -└───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ - -┌──────────────────────┬──────────────────────┬──────────────────────┬──────────────────┬───┬──────────────────────┬─────────────┬─────────────┬────────────┐ -│ id │ url │ host │ domain │ … │ warc_filename │ warc_offset │ warc_length │ wire_bytes │ -│ varchar │ varchar │ varchar │ varchar │ │ varchar │ int64 │ int64 │ int64 │ -├──────────────────────┼──────────────────────┼──────────────────────┼──────────────────┼───┼──────────────────────┼─────────────┼─────────────┼────────────┤ -│ 2023-11-16T12:36:1… │ https://www.thegaz… │ www.thegazette.co.uk │ thegazette.co.uk │ … │ BL-NPLD-2023111611… │ 220898170 │ 460354 │ 358576 │ -├──────────────────────┴──────────────────────┴──────────────────────┴──────────────────┴───┴──────────────────────┴─────────────┴─────────────┴────────────┤ -│ 1 rows 22 columns (8 shown) │ -└───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ - +``` -## To Do - -- Make some DC2023 data accessible via Apache Superset to explore if we need more fields. -- Match the Common Crawl parquet schema if possible. -- Remove Solr/CRDB code and simplify the dependencies as much as possible. -- Consider whether this can be integrated with the Kafka streams, so we can have recent data in a queriable form that is easier to manage than the ELK stack (which can't cope with domain crawl scale) -- Make a Map-Reduce version so we can generate parquet from crawl log lines efficiently. -- Allow integration of richer data from WARCs, e.g. a `webarchive-discovery` tool-chain than can generate compatible Parquet files. ## Previous Designs