diff --git a/qendpoint-core/src/main/java/com/the_qa_company/qendpoint/core/hdt/HDTManagerImpl.java b/qendpoint-core/src/main/java/com/the_qa_company/qendpoint/core/hdt/HDTManagerImpl.java index 2447f31c3..1076f26b3 100644 --- a/qendpoint-core/src/main/java/com/the_qa_company/qendpoint/core/hdt/HDTManagerImpl.java +++ b/qendpoint-core/src/main/java/com/the_qa_company/qendpoint/core/hdt/HDTManagerImpl.java @@ -13,6 +13,7 @@ import com.the_qa_company.qendpoint.core.header.HeaderUtil; import com.the_qa_company.qendpoint.core.listener.ProgressListener; import com.the_qa_company.qendpoint.core.options.HDTOptions; +import com.the_qa_company.qendpoint.core.options.HDTOptionsFile; import com.the_qa_company.qendpoint.core.options.HDTOptionsKeys; import com.the_qa_company.qendpoint.core.options.HDTSpecification; import com.the_qa_company.qendpoint.core.rdf.RDFFluxStop; @@ -192,6 +193,10 @@ public HDTResult doGenerateHDT(String rdfFileName, String baseURI, RDFNotation r long checksum = 0; try { if (spec.getBoolean(HDTOptionsKeys.LOADER_PREDOWNLOAD_URL) && IOUtil.isRemoteURL(rdfFileName)) { + spec = spec.pushTop(); + spec.set(HDTOptionsKeys.LOADER_PREDOWNLOAD_URL, false); // disable + // for + // cattree long retry = spec.getInt(HDTOptionsKeys.LOADER_PREDOWNLOAD_URL_RETRY, 1); final String rdfFileName2 = rdfFileName; @@ -202,14 +207,42 @@ public HDTResult doGenerateHDT(String rdfFileName, String baseURI, RDFNotation r throw new RuntimeException(e); } }); + Path dlFile = preDownload.resolveSibling(preDownload.getFileName() + ".download"); + HDTOptionsFile opFile = new HDTOptionsFile(dlFile); long tryCount = 1; checksumPath = spec.getPath(HDTOptionsKeys.LOADER_PREDOWNLOAD_CHECKSUM_PATH); while (true) { - listener.notifyProgress(0, - "predownload " + rdfFileName + " into " + preDownload + " try #" + tryCount); - InputStream readIs = IOUtil.getFileInputStream(rdfFileName, false); - try (InputStream is = checksumPath != null ? new CRCInputStream(readIs, new CRC32()) : readIs; + opFile.sync(); + long trueSize = IOUtil.getContentLengthLong(rdfFileName); + long lastLength = opFile.getOptions().getInt("last-length", 0); + long preSize; + + if (lastLength != trueSize) { + preSize = 0; + } else { + try { + preSize = Files.size(preDownload); + } catch (IOException ignore) { + preSize = 0; + } + } + opFile.getOptions().set("last-length", trueSize); + opFile.save(); + + listener.notifyProgress(0, "predownload " + rdfFileName + " into " + preDownload + " try #" + + tryCount + " / preSize " + preSize); + + IOUtil.HTTPData readIs = IOUtil.getFileInputStreamData(rdfFileName, false, preSize); + long trueCurrSize = readIs.conn().getContentLengthLong(); + if (trueCurrSize > 0 && trueSize != trueCurrSize) { + logger.error("Find bad file sizes compared to the previous calls, {} != {}", trueCurrSize, + trueSize); + continue; // try again, can it create an infinite loop + // is the server is bad? + } + InputStream stream = readIs.is(); + try (InputStream is = checksumPath != null ? new CRCInputStream(stream, new CRC32()) : stream; OutputStream os = new BufferedOutputStream(Files.newOutputStream(preDownload))) { IOUtil.copy(is, os, listener, 10_000_000); if (is instanceof CRCInputStream crcIs) { diff --git a/qendpoint-core/src/main/java/com/the_qa_company/qendpoint/core/options/HDTOptionsFile.java b/qendpoint-core/src/main/java/com/the_qa_company/qendpoint/core/options/HDTOptionsFile.java new file mode 100644 index 000000000..8f93c5f39 --- /dev/null +++ b/qendpoint-core/src/main/java/com/the_qa_company/qendpoint/core/options/HDTOptionsFile.java @@ -0,0 +1,76 @@ +package com.the_qa_company.qendpoint.core.options; + +import com.the_qa_company.qendpoint.core.compact.integer.VByte; +import com.the_qa_company.qendpoint.core.listener.ProgressListener; +import com.the_qa_company.qendpoint.core.util.crc.CRC32; +import com.the_qa_company.qendpoint.core.util.crc.CRCInputStream; +import com.the_qa_company.qendpoint.core.util.crc.CRCOutputStream; +import com.the_qa_company.qendpoint.core.util.io.IOUtil; + +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Objects; +import java.util.Set; + +public class HDTOptionsFile { + public static final long MAGIC = 0x4f464e4c44544448L; + private HDTOptions options = HDTOptions.of(); + + private final Path location; + + public HDTOptionsFile(Path location) { + this.location = location; + } + + public HDTOptions getOptions() { + return options; + } + + public void sync() throws IOException { + HDTOptions options = HDTOptions.of(); + + if (!Files.exists(location)) { + this.options = options; + return; + } + ProgressListener l = ProgressListener.ignore(); + try (CRCInputStream is = new CRCInputStream(new BufferedInputStream(Files.newInputStream(location)), + new CRC32())) { + if (IOUtil.readLong(is) != MAGIC) + throw new IOException("Can't read HDTOptions file: Bad magic"); + + long count = VByte.decode(is); + for (long i = 0; i < count; i++) { + String key = IOUtil.readSizedString(is, l); + String val = IOUtil.readSizedString(is, l); + if (!val.isEmpty()) { + options.set(key, val); + } + } + if (!is.readCRCAndCheck()) { + throw new IOException("Can't read HDTOptions file: Bad CRC"); + } + } + this.options = options; + } + + public void save() throws IOException { + ProgressListener l = ProgressListener.ignore(); + try (CRCOutputStream os = new CRCOutputStream(new BufferedOutputStream(Files.newOutputStream(location)), + new CRC32())) { + IOUtil.writeLong(os, MAGIC); + Set keys = options.getKeys(); + VByte.encode(os, keys.size()); + for (Object k : keys) { + String key = String.valueOf(k); + String val = Objects.requireNonNull(options.get(key), ""); + IOUtil.writeSizedString(os, key, l); + IOUtil.writeSizedString(os, val, l); + } + os.writeCRC(); + } + } +} diff --git a/qendpoint-core/src/main/java/com/the_qa_company/qendpoint/core/util/io/IOUtil.java b/qendpoint-core/src/main/java/com/the_qa_company/qendpoint/core/util/io/IOUtil.java index 92b072864..6353515a4 100644 --- a/qendpoint-core/src/main/java/com/the_qa_company/qendpoint/core/util/io/IOUtil.java +++ b/qendpoint-core/src/main/java/com/the_qa_company/qendpoint/core/util/io/IOUtil.java @@ -44,6 +44,7 @@ import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStream; +import java.net.HttpURLConnection; import java.net.URI; import java.net.URISyntaxException; import java.net.URL; @@ -370,8 +371,14 @@ public static InputStream getFileInputStream(String fileName) throws IOException } public static InputStream getFileInputStream(String fileName, boolean uncompress) throws IOException { + return getFileInputStream(fileName, uncompress, -1); + } + + public static InputStream getFileInputStream(String fileName, boolean uncompress, long startLen) + throws IOException { InputStream input; String name = fileName.toLowerCase(); + boolean skipHandled = startLen <= 0; if (isRemoteURL(fileName)) { URL url; try { @@ -380,6 +387,10 @@ public static InputStream getFileInputStream(String fileName, boolean uncompress throw new IOException("Invalid URI", e); } URLConnection con = url.openConnection(); + if (startLen > 0 && con instanceof HttpURLConnection http) { + http.setRequestProperty("Range", "bytes=" + startLen + "-"); + skipHandled = true; + } con.connect(); input = con.getInputStream(); } else if (name.equals("-")) { @@ -387,6 +398,9 @@ public static InputStream getFileInputStream(String fileName, boolean uncompress } else { input = new BufferedInputStream(new FileInputStream(fileName)); } + if (!skipHandled) { + input.skipNBytes(startLen); + } if (uncompress) { if (name.endsWith(".gz") || name.endsWith(".tgz")) { @@ -400,6 +414,71 @@ public static InputStream getFileInputStream(String fileName, boolean uncompress return input; } + public record HTTPData(InputStream is, HttpURLConnection conn) implements Closeable { + @Override + public void close() throws IOException { + is.close(); + } + } + + public static HTTPData getFileInputStreamData(String fileName, boolean uncompress, long startLen) + throws IOException { + InputStream input; + String name = fileName.toLowerCase(); + if (!isRemoteURL(fileName)) { + throw new IOException("Not a http connection"); + } + URL url; + try { + url = new URI(fileName).toURL(); + } catch (URISyntaxException e) { + throw new IOException("Invalid URI", e); + } + + if (!url.getProtocol().equalsIgnoreCase("http") && !url.getProtocol().equalsIgnoreCase("https")) { + throw new IOException("Not a http connection"); + } + HttpURLConnection con = (HttpURLConnection) url.openConnection(); + if (startLen > 0) { + con.setRequestProperty("Range", "bytes=" + startLen + "-"); + } + con.connect(); + input = con.getInputStream(); + + if (uncompress) { + if (name.endsWith(".gz") || name.endsWith(".tgz")) { + input = new GZIPInputStream(input); + } else if (name.endsWith("bz2") || name.endsWith("bz")) { + input = new BZip2CompressorInputStream(input, true); + } else if (name.endsWith("xz")) { + input = new XZCompressorInputStream(input, true); + } + } + return new HTTPData(input, con); + } + + public static long getContentLengthLong(String filename) throws IOException { + if (!isRemoteURL(filename)) { + return Files.size(Path.of(filename)); + } + if (filename.equals("-")) { + return 0; + } + URL url; + try { + url = new URI(filename).toURL(); + } catch (URISyntaxException e) { + throw new IOException("Invalid URI", e); + } + if (!url.getProtocol().equalsIgnoreCase("https") && !url.getProtocol().equalsIgnoreCase("http")) { + return 0; + } + HttpURLConnection con = (HttpURLConnection) url.openConnection(); + con.setRequestMethod("HEAD"); + con.connect(); + return con.getContentLengthLong(); + } + public static BufferedReader getFileReader(String fileName) throws IOException { return new BufferedReader(new InputStreamReader(getFileInputStream(fileName))); } diff --git a/qendpoint-core/src/test/java/com/the_qa_company/qendpoint/core/util/io/IOUtilTest.java b/qendpoint-core/src/test/java/com/the_qa_company/qendpoint/core/util/io/IOUtilTest.java index cb1a193ef..58e3b35b5 100644 --- a/qendpoint-core/src/test/java/com/the_qa_company/qendpoint/core/util/io/IOUtilTest.java +++ b/qendpoint-core/src/test/java/com/the_qa_company/qendpoint/core/util/io/IOUtilTest.java @@ -1,17 +1,22 @@ package com.the_qa_company.qendpoint.core.util.io; +import static org.junit.Assert.assertArrayEquals; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.Closeable; import java.io.IOException; +import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Path; +import java.util.Arrays; import org.junit.Assert; import org.junit.Before; +import org.junit.Ignore; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; @@ -66,16 +71,16 @@ public void testWriteInt() { ByteArrayInputStream bin = new ByteArrayInputStream(bout.toByteArray()); - long a = IOUtil.readInt(bin); + int a = IOUtil.readInt(bin); assertEquals(a, 3); - long b = IOUtil.readInt(bin); + int b = IOUtil.readInt(bin); assertEquals(b, 4); - long c = IOUtil.readInt(bin); + int c = IOUtil.readInt(bin); assertEquals(c, 0xFF0000AA); - long d = IOUtil.readInt(bin); + int d = IOUtil.readInt(bin); assertEquals(d, 0xAABBCCDD); } catch (IOException e) { @@ -137,4 +142,48 @@ public void closeablePathTest() throws IOException { Assert.assertFalse(Files.exists(p2)); } + + @Ignore("Hand test") + @Test + public void urlTest() throws IOException { + final String url = "https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.ttl.bz2"; + + long len = IOUtil.getContentLengthLong(url); + + assertTrue("bad len: " + len, len > 0); + + System.out.println(len); + + byte[] read; + final int toRead = 0x1000; + try (InputStream is = IOUtil.getFileInputStream(url, false)) { + read = is.readNBytes(toRead); + } + + assertEquals(toRead, read.length); + + byte[] read2; + try (InputStream is = IOUtil.getFileInputStream(url, false)) { + read2 = is.readNBytes(toRead); + } + + assertArrayEquals(read, read2); + + byte[] read3; + int midRead = 0x500; + try (InputStream is = IOUtil.getFileInputStream(url, false)) { + is.skipNBytes(midRead); + read3 = is.readNBytes(midRead); + } + byte[] read4; + try (InputStream is = IOUtil.getFileInputStream(url, false, midRead)) { + read4 = is.readNBytes(midRead); + } + + assertArrayEquals(read3, read4); + + byte[] read5 = Arrays.copyOfRange(read, midRead, midRead + midRead); + + assertArrayEquals(read3, read5); + } }