-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMakefile
134 lines (99 loc) · 6.22 KB
/
Makefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# User-tunable settings. All of them can be overridden on the command line
# or from the environment.
#
# SRC_DIR: directory containing the DNB EPUB files, e.g.
#   make -j 96 target/dnb.index.tar.xz SRC_DIR=../sample.10000
SRC_DIR ?= /mnt/data/KorAP@DNB/Random-10-12-23.epub
# YEARS: two-digit years to process, e.g.
#   make -j12 i5valid YEARS="18 19"
# The default is the last two digits of every year from 2012 to 2024.
YEARS ?= $(shell seq -w 2012 2024 | sed 's/^.*\([0-9][0-9]\)/\1/')
# "?=" creates a recursive variable, so the shell pipeline above would be
# re-run on every reference; re-assigning with ":=" evaluates it only once.
YEARS := $(YEARS)
BUILD_DIR = build
TARGET_DIR ?= target
DEPLOY_HOST ?= compute.ids-mannheim.de
DEPLOY_USER ?= korap
DEPLOY_PATH ?= /export/netapp/korap4dnb
# Number of worker threads handed to the annotation and conversion tools.
MAX_THREADS ?= $(shell nproc)
MAX_THREADS := $(MAX_THREADS)
# NOTE(review): GNU make predefines MAKE, so this conditional assignment most
# likely never takes effect, and no rule below references $(MAKE) - confirm
# whether it can be dropped.
MAKE ?= make -j $(shell nproc)
# Java heap for korapxml2conllu in megabytes: 2500 per thread.
KORAPXML2CONLLU_HEAP ?= $(shell echo "$$(($(MAX_THREADS) * 2500))")
KORAPXML2CONLLU_HEAP := $(KORAPXML2CONLLU_HEAP)
KORAPXML2CONLLU ?= java -Xmx$(KORAPXML2CONLLU_HEAP)m -jar lib/korapxml2conllu.jar
# NOTE(review): the logging configuration is read from the absolute path
# /logging.properties - confirm that this is intended.
SAXON ?= java -Djava.util.logging.config.file=/logging.properties -cp lib/saxon-ee-12.4.jar:lib/xmlresolver-5.2.2.jar:lib/textclassifier.jar:lib/xmlresolver-5.2.2-data.jar net.sf.saxon.Transform -expand:off
# Delete a target file whose recipe failed, so that a partial output is never
# mistaken for an up-to-date one.
.DELETE_ON_ERROR:
# Command-style goals that do not name files. The two server helpers are
# listed under the names of the rules that actually exist in this file
# (show-server-log, show-server-status).
.PHONY: all clean test i5 i5valid krill index deploy show-server-log show-server-status
# Never auto-delete these products, even when make built them only as a step
# in a longer chain.
.PRECIOUS: $(TARGET_DIR)/%.i5.xml $(TARGET_DIR)/dnb%.pre.i5.xml %.zip %.tree_tagger.zip %.ud.zip %.marmot-malt.zip %.spacy.zip %.i5.xml %.tar
# Default goal: build the combined index.
all: index

# One dnb<YY>.krill.tar per entry in YEARS.
krill: $(addsuffix .krill.tar,$(addprefix $(TARGET_DIR)/dnb,$(YEARS)))

# The combined index directory.
index: $(TARGET_DIR)/dnb.index
# Final per-year I5 file: feed the pre-assembled corpus through the pass2
# stylesheet and pipe the result straight into the pass3 stylesheet.
# The classifier model is a prerequisite although the recipe does not name it;
# presumably one of the stylesheets loads it - TODO confirm.
# NOTE(review): the exit status of a pipeline is that of its last command, so
# a failure of the first transform alone does not fail this recipe.
$(TARGET_DIR)/dnb%.i5.xml: $(TARGET_DIR)/dnb%.pre.i5.xml xslt/pass2.xsl xslt/pass3.xsl models/dereko_domains_s.classifier
	$(SAXON) -xsl:$(word 2,$^) $< \
	  | $(SAXON) -xsl:$(word 3,$^) - > $@
# Assemble the raw per-year corpus dnb<YY>.pre.i5.xml from the per-book I5
# files. The stem (<YY>) is the two-digit year.
#
# Prerequisites: one $(TARGET_DIR)/<name>.i5.xml for every
# $(SRC_DIR)/<name>.epub found when the makefile is parsed.
#
# Steps:
#   1. Make sure $(TARGET_DIR) exists; nothing else guarantees it when no
#      prerequisite had to be built (for example when SRC_DIR has no EPUBs).
#   2. Write the per-book file names to $(TARGET_DIR)/filelist<YY>.txt, one per
#      line. The names are emitted in eleven slices keyed on the last
#      character of the EPUB basename (0-9 and X), presumably to keep each
#      shell command line short - TODO confirm. Names ending in any other
#      character are not listed.
#   3. Start the output with all but the last line of the corpus template,
#      replacing the first {YY} on each line with the stem.
#   4. Append every listed file whose first 500 lines contain a
#      <pubDate type="year"> value with the stem as its third and fourth
#      characters.
#   5. Finish the output with the last line of the template.
$(TARGET_DIR)/dnb%.pre.i5.xml: $(patsubst $(SRC_DIR)/%.epub,$(TARGET_DIR)/%.i5.xml,$(wildcard $(SRC_DIR)/*.epub))
	@mkdir -p $(TARGET_DIR)
	@echo $(patsubst $(SRC_DIR)/%.epub,$(TARGET_DIR)/%.i5.xml,$(wildcard $(SRC_DIR)/*0.epub)) > $(TARGET_DIR)/filelist$*.txt
	@echo $(patsubst $(SRC_DIR)/%.epub,$(TARGET_DIR)/%.i5.xml,$(wildcard $(SRC_DIR)/*1.epub)) >> $(TARGET_DIR)/filelist$*.txt
	@echo $(patsubst $(SRC_DIR)/%.epub,$(TARGET_DIR)/%.i5.xml,$(wildcard $(SRC_DIR)/*2.epub)) >> $(TARGET_DIR)/filelist$*.txt
	@echo $(patsubst $(SRC_DIR)/%.epub,$(TARGET_DIR)/%.i5.xml,$(wildcard $(SRC_DIR)/*3.epub)) >> $(TARGET_DIR)/filelist$*.txt
	@echo $(patsubst $(SRC_DIR)/%.epub,$(TARGET_DIR)/%.i5.xml,$(wildcard $(SRC_DIR)/*4.epub)) >> $(TARGET_DIR)/filelist$*.txt
	@echo $(patsubst $(SRC_DIR)/%.epub,$(TARGET_DIR)/%.i5.xml,$(wildcard $(SRC_DIR)/*5.epub)) >> $(TARGET_DIR)/filelist$*.txt
	@echo $(patsubst $(SRC_DIR)/%.epub,$(TARGET_DIR)/%.i5.xml,$(wildcard $(SRC_DIR)/*6.epub)) >> $(TARGET_DIR)/filelist$*.txt
	@echo $(patsubst $(SRC_DIR)/%.epub,$(TARGET_DIR)/%.i5.xml,$(wildcard $(SRC_DIR)/*7.epub)) >> $(TARGET_DIR)/filelist$*.txt
	@echo $(patsubst $(SRC_DIR)/%.epub,$(TARGET_DIR)/%.i5.xml,$(wildcard $(SRC_DIR)/*8.epub)) >> $(TARGET_DIR)/filelist$*.txt
	@echo $(patsubst $(SRC_DIR)/%.epub,$(TARGET_DIR)/%.i5.xml,$(wildcard $(SRC_DIR)/*9.epub)) >> $(TARGET_DIR)/filelist$*.txt
	@echo $(patsubst $(SRC_DIR)/%.epub,$(TARGET_DIR)/%.i5.xml,$(wildcard $(SRC_DIR)/*X.epub)) >> $(TARGET_DIR)/filelist$*.txt
	sed -i -e 's/ /\n/g; /^$$/d' $(TARGET_DIR)/filelist$*.txt
	head -n -1 xslt/idsCorpus-template.xml | sed -e 's/{YY}/$*/' > $@
	@while IFS= read -r f; do \
		if head -500 "$$f" | grep -Eq '<pubDate type="year">..$*'; then \
			cat "$$f" >> $@; \
		fi; \
	done < $(TARGET_DIR)/filelist$*.txt
	tail -n 1 xslt/idsCorpus-template.xml >> $@
# Run the XML test script once the classifier model is present and i5valid
# has succeeded. The script is the last prerequisite.
test: models/dereko_domains_s.classifier i5valid test/test-xml.sh
	bash $(lastword $^)
# Build the final I5 file of every year in YEARS.
i5: $(addsuffix .i5.xml,$(addprefix $(TARGET_DIR)/dnb,$(YEARS)))

# Check every per-year I5 file with xmllint (DTD validation, no output).
i5valid: i5
	xmllint --noout --valid $(addsuffix .i5.xml,$(addprefix $(TARGET_DIR)/dnb,$(YEARS)))
# Unpack one EPUB into a directory of the same name below $(BUILD_DIR).
# The target is a directory; existing files in it are overwritten, and the
# extracted tree is made readable and writable for user and group.
$(BUILD_DIR)/%: $(SRC_DIR)/%.epub
	mkdir -p $@
	echo "Converting $< to $@"
	unzip -qo $< -d $@
	chmod -R u+rwX,g+rwX $@
# Convert one unpacked EPUB directory into a per-book I5 file.
#
# The .opf file(s) found below the unpacked directory are handed to the
# epub2i5 stylesheet. The lookup is done by the recipe's own shell rather than
# by make's shell function, so it runs only when the recipe runs (not during
# "make -n") and shows up in the echoed command.
#
# A failing transform is tolerated on purpose: a warning is printed and the
# target is left as an empty file so the rest of the build can continue.
$(TARGET_DIR)/%.i5.xml: $(BUILD_DIR)/% xslt/epub2i5.xsl xslt/idsCorpus-template.xml
	mkdir -p $(TARGET_DIR)
	echo "Converting $< to $@"
	$(SAXON) -xsl:xslt/epub2i5.xsl $$(find $< -name "*.opf") > $@ || (echo "WARN: ignoring invalid $@" && > $@)
# Convert an I5 file into a zip via tei2korapxml, which reads the I5 from
# standard input and writes the archive to standard output.
%.zip: %.i5.xml
	tei2korapxml -l warn -s -tk - \
	  < $< \
	  > $@
# TreeTagger annotation layer: export the base zip as CoNLL-U, tag it in the
# korap/conllu2treetagger container (German), and pack the result again.
# pv only reports throughput of the stream.
%.tree_tagger.zip: %.zip
	$(KORAPXML2CONLLU) $< \
	  | pv \
	  | docker run --rm -i korap/conllu2treetagger -l german \
	  | conllu2korapxml > $@
# spaCy annotation layer: export the base zip as CoNLL-U, process it in the
# korap/conllu2spacy container, and pack the result again.
%.spacy.zip: %.zip
	$(KORAPXML2CONLLU) $< \
	  | pv \
	  | docker run --rm -i korap/conllu2spacy \
	  | conllu2korapxml > $@
# Model downloads. curl runs with -f so that an HTTP error (for example 404)
# makes the recipe fail instead of silently storing the server's error page
# as the model file, which make would then treat as up to date.

# MarMoT model for German.
models/de.marmot:
	mkdir -p $(@D)
	curl -fsL -o $@ https://cistern.cis.lmu.de/marmot/models/CURRENT/spmrl/de.marmot

# Model passed to the malt parser option of korapxml2conllu.
models/german.mco:
	mkdir -p $(@D)
	curl -fsL -o $@ https://corpora.ids-mannheim.de/tools/$@

# Classifier model required by the final I5 build and by the test goal.
models/dereko_domains_s.classifier:
	mkdir -p $(@D)
	curl -fsL -o $@ https://corpora.ids-mannheim.de/tools/$@
# MarMoT tagging plus Malt parsing in a single korapxml2conllu run.
# The second and third prerequisites are the MarMoT and Malt model files.
%.marmot-malt.zip: %.zip models/de.marmot models/german.mco
	$(KORAPXML2CONLLU) -T $(MAX_THREADS) -t marmot:$(word 2,$^) -P malt:$(word 3,$^) $< \
	  | conllu2korapxml -f "marmot dependency:malt" > $@
# UDPipe annotation layer: export the base zip as CoNLL-U, run it through the
# local scripts/udpipe2 wrapper, and pack the result again.
%.ud.zip: %.zip
	$(KORAPXML2CONLLU) $< \
	  | pv \
	  | ./scripts/udpipe2 \
	  | conllu2korapxml > $@
# Convert the base zip plus its MarMoT/Malt and TreeTagger layers with
# korapxml2krill. Every prerequisite is passed as its own -i input.
# The cache and the temporary extraction directory live below
# $(BUILD_DIR)/krill/<target without .tar>.
# NOTE(review): the output option is the target name without ".tar"; whether
# the tool then produces $@ itself depends on korapxml2krill and on
# krill-korap4dnb.cfg - confirm.
%.krill.tar: %.zip %.marmot-malt.zip %.tree_tagger.zip
	mkdir -p $(BUILD_DIR)/krill/$(basename $@) $(basename $@)
	korapxml2krill archive --quiet -w -z -cfg krill-korap4dnb.cfg \
	  -c $(BUILD_DIR)/krill/$(basename $@)/korapxml2krill.cache \
	  -j $(MAX_THREADS) \
	  -te $(BUILD_DIR)/krill/$(basename $@) \
	  --non-word-tokens --meta I5 \
	  $(foreach archive,$^,-i $(archive)) \
	  -o $(basename $@)
# Unpack a Krill tar archive into a fresh directory named <stem>.json.
# The target is a directory; any previous copy is removed first.
%.json: %.krill.tar
	rm -rf $@
	mkdir -p $@
	tar -C $@ -xf $<
# A single space held in a variable: the only reliable way to name a space as
# a $(subst ...) argument. Make does not interpret quotes, so " " would be
# searched for as the literal three characters quote-space-quote.
krill_index_empty :=
krill_index_space := $(krill_index_empty) $(krill_index_empty)

# Build the combined index from the unpacked per-year directories.
# All input directories are joined with ";" into ONE -i argument. The argument
# is quoted because an unquoted ";" would end the shell command.
# NOTE(review): the ";" separator follows the substitution this recipe has
# always attempted; confirm it against the Krill indexer's command-line help.
$(TARGET_DIR)/dnb.index: $(foreach year,$(YEARS),$(TARGET_DIR)/dnb$(year).json)
	rm -rf $@
	java -jar lib/Krill-Indexer.jar -c lib/krill.conf -i "$(subst $(krill_index_space),;,$^)" -o $@
# Pack the index directory into an xz-compressed tarball. xz runs with -T0,
# and tar changes into the index's parent directory so the archive holds only
# the bare directory name.
$(TARGET_DIR)/dnb.index.tar.xz: $(TARGET_DIR)/dnb.index
	tar -I 'xz -T0' -C $(<D) -cf $@ $(<F)
# Copy the index tarball and the compose file to the deploy host, then swap
# the index in place:
#   1. dry-run the compose "up" and abort if it fails,
#   2. stop the running project,
#   3. rotate an existing dnb.index to dnb.index.bak,
#   4. unpack the new index and start the services again.
#
# The rotation removes an older backup first. Without that, mv would move the
# index INTO the existing dnb.index.bak directory, fail on a later deploy, and
# leave tar extracting on top of the old index. The backup is only replaced
# when there is a current index to take its place, and a failed rotation still
# does not abort the deploy.
deploy: $(TARGET_DIR)/dnb.index.tar.xz korap4dnb-compose.yml
	rsync -v $^ $(DEPLOY_USER)@$(DEPLOY_HOST):$(DEPLOY_PATH)/
	ssh $(DEPLOY_USER)@$(DEPLOY_HOST) "mkdir -p $(DEPLOY_PATH) && cd $(DEPLOY_PATH) && \
	  docker compose -p korap4dnb --profile=lite -f $(notdir $(word 2,$^)) up -d --dry-run && \
	  docker compose -p korap4dnb stop && \
	  (if [ -e dnb.index ]; then rm -rf dnb.index.bak && mv -f dnb.index dnb.index.bak; fi || true) && \
	  tar Jxvf $(notdir $<) && \
	  docker compose -p korap4dnb --profile=lite -f $(notdir $(word 2,$^)) up -d"
# Follow the compose logs of the korap4dnb project on the deploy host.
show-server-log:
	ssh $(DEPLOY_USER)@$(DEPLOY_HOST) \
	  "cd $(DEPLOY_PATH) && docker compose -p korap4dnb --profile=lite -f korap4dnb-compose.yml logs -f"
# List the containers of the korap4dnb project on the deploy host.
show-server-status:
	ssh $(DEPLOY_USER)@$(DEPLOY_HOST) \
	  "cd $(DEPLOY_PATH) && docker compose -p korap4dnb --profile=lite -f korap4dnb-compose.yml ps"
# Remove the unpack area and every generated product.
clean:
	rm -fr $(BUILD_DIR) $(TARGET_DIR)