-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathcompressed_file_reader_test.py
executable file
·105 lines (88 loc) · 3.33 KB
/
compressed_file_reader_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#!/usr/bin/env python
'''in which I show how to pass a file object pointing to uncompressed data
to a function and have it read compressed data--without reading it all into memory'''
import os
import zlib
import gzip
import struct
# Scratch file (created in the current working directory) used by the test.
filename = 'test'
# Payload that gets compressed and round-tripped. It is written to a file
# opened in binary mode and fed to zlib, so it must be a byte string; the
# explicit b'' prefix keeps it one on Python 3 (on Python 2 it is identical
# to the plain literal).
test_data = b'test data\nline two\nline three' * 50
class CompressedFileReader(object):
    '''
    Wraps a file object and provides a read method that returns gzip'd data.

    The wrapped file object must return byte strings from read() (for
    example, a file opened in binary mode).

    One warning: if read is called with a small value, the data returned may
    be bigger than the value. In this case, the "compressed" data will be
    bigger than the original data. To solve this, use a bigger read buffer.

    An example use case:
    Given an uncompressed file on disk, provide a way to read compressed data
    without buffering the entire file data in memory. Using this class, an
    uncompressed log file could be uploaded as compressed data with chunked
    transfer encoding.

    gzip header and footer code taken from the python stdlib gzip module
    '''

    def __init__(self, file_obj, compresslevel=9):
        '''
        file_obj: object with a read() method returning uncompressed bytes.
        compresslevel: zlib compression level handed to zlib.compressobj.
        '''
        self._f = file_obj
        # Negative wbits asks zlib for a raw deflate stream (no zlib header
        # or trailer); the gzip header and footer are added by read().
        # The trailing 0 is the compression strategy argument.
        self._compressor = zlib.compressobj(compresslevel,
                                            zlib.DEFLATED,
                                            -zlib.MAX_WBITS,
                                            zlib.DEF_MEM_LEVEL,
                                            0)
        self.done = False      # True once the gzip footer has been returned
        self.first = True      # True until the gzip header has been returned
        self.crc32 = 0         # running CRC-32 of the uncompressed input
        self.total_size = 0    # running count of uncompressed input bytes

    def read(self, *a, **kw):
        '''
        Read from the wrapped file and return the next piece of gzip data.

        All arguments are passed straight through to the wrapped file's
        read(). Returns an empty byte string once the complete gzip stream
        (header, deflate data, footer) has been handed out.
        '''
        if self.done:
            return b''
        if a and a[0] == 0:
            # A zero-length request makes the wrapped file return nothing,
            # which must not be mistaken for end-of-file: doing so would
            # finalize the stream and drop all data not yet read.
            return b''
        x = self._f.read(*a, **kw)
        if x:
            self.crc32 = zlib.crc32(x, self.crc32) & 0xffffffff
            self.total_size += len(x)
            compressed = self._compressor.compress(x)
            if not compressed:
                # The compressor buffered everything. Returning an empty
                # string here would look like EOF to the caller, so force
                # out whatever is pending.
                compressed = self._compressor.flush(zlib.Z_SYNC_FLUSH)
        else:
            # Wrapped file is exhausted: finish the deflate stream and append
            # the gzip footer (CRC-32 and input size modulo 2**32, both
            # little-endian unsigned 32-bit).
            compressed = self._compressor.flush(zlib.Z_FINISH)
            crc32 = struct.pack("<L", self.crc32 & 0xffffffff)
            size = struct.pack("<L", self.total_size & 0xffffffff)
            footer = crc32 + size
            compressed += footer
            self.done = True
        if self.first:
            self.first = False
            # Fixed 10-byte gzip header: magic (1f 8b), deflate method (08),
            # no flags, zero mtime, extra flags 02, OS byte ff.
            header = b'\037\213\010\000\000\000\000\000\002\377'
            compressed = header + compressed
        return compressed
def do_something(f, size=1024):
    '''
    Drain f by calling f.read(size) until it returns an empty value, then
    report how the collected data compares with the module-level test_data.

    f: object whose read(size) returns chunks of gzip data.
    size: read size passed to f.read; compression gets better as this goes
          up (don't set it too small).

    Prints the original size, the compressed size and whether decompressing
    the collected data reproduces test_data. Returns the collected data as a
    single byte string.
    '''
    buff = []
    x = f.read(size)
    while x:
        buff.append(x)
        x = f.read(size)
    buff = b''.join(buff)
    # 16 + MAX_WBITS tells zlib to expect a gzip header and trailer.
    matches = test_data == zlib.decompress(buff, 16 + zlib.MAX_WBITS)
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print('original size: %d' % len(test_data))
    print('compressed size: %d' % len(buff))
    print('data matches: %s' % matches)
    return buff
# End-to-end check: write the test data to disk, read it back through
# CompressedFileReader, then confirm the stdlib gzip module can decode the
# result. The scratch file is removed even when a step fails.
try:
    # make a test file
    with open(filename, 'wb') as f:
        f.write(test_data)

    # using the uncompressed file, test the compressor wrapper
    with open(filename, 'rb') as f:
        compressed_f = CompressedFileReader(f)
        compressed_data = do_something(compressed_f)

    # write the compressed data out (for the next test)
    with open(filename, 'wb') as f:
        f.write(compressed_data)

    # make sure the compressed data can be read by gzip (ensures tools like
    # gunzip will work)
    with gzip.GzipFile(filename, 'rb') as f:
        d = f.read()
    # Raised explicitly so the check still runs under "python -O", which
    # strips assert statements.
    if test_data != d:
        raise AssertionError('gzip did not reproduce the original test data')
    print('Passed: compressed data was successfully read by gzip')
finally:
    # clean up
    if os.path.exists(filename):
        os.unlink(filename)