"""Small file helpers: JSON, text and XML read/write with optional gzip.

Paths ending in ``.gz`` are transparently read and written through gzip.
The parameter name ``fufi`` is used throughout for a file path given as
a string (presumably "full file name" -- confirm with the author).
"""

import errno
import filecmp
# import glob
import gzip
import json
import os
# import re
# import shotiser
import sys
import tempfile
from datetime import datetime
from shutil import copyfile

try:
    import lxml.etree as etree
except ImportError:
    # lxml is only needed by install_xml(); every other helper here is
    # stdlib-only and stays usable without it.
    etree = None

# re_changed = re.compile('It\\s+was\\s+last\\s+changed' +
#                         '\\s+on\\s+\\d{4}.\\d{2}.\\d{2}\\.\\s*')
# re_whitespace = re.compile('\\s+')
#
#
# # a helper for the download
# def temp_warc_path():
#     pid = os.getpid()
#     tist = datetime.now().strftime('%s')
#     warc_path = '/tmp/' + str(pid) + '_' + tist
#     return warc_path
#
#


def _is_gz(fufi):
    """Return True when the path string *fufi* ends in ``.gz``."""
    return fufi.endswith('.gz')


def prepare(filename):
    """Make sure the directory that will hold *filename* exists.

    A bare file name (no directory part) needs no directory and is
    accepted silently. Nothing is done when the directory path already
    exists.
    """
    directory = os.path.dirname(filename)
    if not directory or os.path.exists(directory):
        return
    # exist_ok covers the race where another process creates the
    # directory between the check above and this call.
    os.makedirs(directory, exist_ok=True)


def install_xml(ele, fufi):
    """Write the XML element *ele* pretty-printed to *fufi* if it changed.

    The element is serialised to a temporary file first. When *fufi*
    already exists with identical content it is left untouched;
    otherwise the temporary file is copied over it. The temporary file
    is removed in every case.

    Raises:
        ImportError: lxml is not installed.
    """
    if etree is None:
        raise ImportError('install_xml needs lxml, which is not installed')
    # Only the name is needed; close the handle right away so it does
    # not stay open while lxml writes to the same path.
    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
        temp_file_name = temp_file.name
    try:
        etree.ElementTree(ele).write(temp_file_name, pretty_print=True)
        prepare(fufi)
        # shallow=False forces a content comparison; a shallow compare
        # may report equality from matching size and mtime alone.
        if os.path.exists(fufi) and filecmp.cmp(temp_file_name, fufi,
                                                shallow=False):
            return
        copyfile(temp_file_name, fufi)
    finally:
        os.remove(temp_file_name)


def load(fufi):
    """Read JSON from *fufi* (gzip-compressed when it ends in ``.gz``).

    Both variants are decoded as UTF-8.
    """
    if _is_gz(fufi):
        with gzip.GzipFile(fufi, 'r') as json_file:
            return json.loads(json_file.read().decode('utf-8'))
    with open(fufi, encoding='utf-8') as the_file:
        return json.load(the_file)


def dump(a1, a2):
    """Write data as JSON to a file and return the file path.

    The arguments may come in either order: when *a1* is a ``str`` it is
    taken as the path and *a2* as the data, otherwise *a2* is the path
    and *a1* the data. A path ending in ``.gz`` is written
    gzip-compressed.
    """
    if isinstance(a1, str):
        fufi, data = a1, a2
    else:
        fufi, data = a2, a1
    if _is_gz(fufi):
        # NOTE(review): the gzip variant keeps non-ASCII characters
        # verbatim (ensure_ascii=False) while the plain variant escapes
        # them; both load back to the same data.
        payload = json.dumps(data, ensure_ascii=False,
                             indent=1).encode('utf-8')
        with gzip.GzipFile(fufi, 'w') as the_file:
            the_file.write(payload)
        return fufi
    with open(fufi, 'w', encoding='utf-8') as the_file:
        json.dump(data, the_file, indent=1)
    return fufi


#def mtime(fufi):
#    mtime = os.path.getmtime(fufi)
#    mtime = int(mtime)
#    return mtime
#
#
# def shoti(fufi):
#     mtime = os.path.getmtime(fufi)
#     mtime = int(mtime)
#     mshoti = shotiser.make(mtime)
#     return mshoti
#
#
#def age(fufi):
#    mtime = os.path.getmtime(fufi)
#    now = datetime.now().strftime('%s')
#    age = int(now) - mtime
#    return age
#
#
#def concat(canonic_fufi, supplem_fufi, do_verbose=False):
#    if(do_verbose):
#        print("I append " + supplem_fufi + ' to ' + canonic_fufi)
#    with open(canonic_fufi, 'ab') as canonic_file:
#        with open(supplem_fufi, 'rb') as supplem_file:
#            canonic_file.write(supplem_file.read())
#
#
#def bread(fufi):
#    if(not os.path.isfile(fufi)):
#        raise Exception(fufi + ' is not there.')
#    the_file = open(fufi, 'br')
#    string = the_file.read()
#    the_file.close()
#    return string
#
#


def sread(fufi):
    """Return the content of *fufi* as a string.

    A path ending in ``.gz`` is decompressed and decoded with the
    default codec of ``bytes.decode`` (UTF-8); any other file is read in
    text mode with the platform's default encoding.

    Raises:
        FileNotFoundError: *fufi* is not an existing regular file.
    """
    if not os.path.isfile(fufi):
        raise FileNotFoundError(fufi + ' is not there.')
    if _is_gz(fufi):
        with gzip.GzipFile(fufi, 'r') as the_file:
            return the_file.read().decode()
    with open(fufi, "r") as the_file:
        return the_file.read()


#def brite(fufi, string, with_backup=False, do_verbose=False):
#    #
#    # if exists, check for changes before writing
#    if(os.path.isfile(fufi)):
#        old_string = bread(fufi)
#        if(old_string == string):
#            if(do_verbose):
#                print("No change in " + fufi)
#            return False
#    the_file = open(fufi, 'bw')
#    the_file.write(string)
#    the_file.close()
#    if(do_verbose):
#        print("I write " + fufi)
#    return True


def srite(fufi, string, do_backup=False, do_verbose=False,
          do_change_check=True, do_preserve_time=None):
    """Write *string* to *fufi*, skipping the write when nothing changed.

    Args:
        fufi: target path; a ``.gz`` suffix selects gzip output.
        string: text to write.
        do_backup: accepted for compatibility; not used by this function.
        do_verbose: print a line saying whether the file was kept or
            written.
        do_change_check: when true and the file exists, compare the
            current content first and leave the file alone if equal.
        do_preserve_time: ``None`` or ``False`` leaves the timestamps to
            the operating system; ``True`` restores the modification
            time the file had before the write (nothing to restore when
            the file did not exist yet); an ``int`` or ``float`` is
            applied as access and modification time after the write.

    Returns:
        ``True`` when the file was written, ``False`` when it was kept.

    Raises:
        TypeError: *do_preserve_time* has an unsupported type.
        Exception: the file is missing after the write.
    """
    # Work out the timestamp to restore before the file is touched.
    keep_time = None
    if do_preserve_time is True:
        try:
            keep_time = os.path.getmtime(fufi)
        except FileNotFoundError:
            keep_time = None
    elif do_preserve_time is None or do_preserve_time is False:
        # False is an int to isinstance(); without this branch it would
        # be applied as timestamp 0 and reset the file to the epoch.
        keep_time = None
    elif isinstance(do_preserve_time, (int, float)):
        keep_time = do_preserve_time
    else:
        raise TypeError('do_preserve_time must be None, a bool or a number')
    #
    # if exists, check for changes before writing
    if do_change_check and os.path.isfile(fufi):
        if sread(fufi) == string:
            if do_verbose:
                print("filer: I keep " + fufi)
            return False
    # NOTE(review): the plain path uses the platform's default text
    # encoding, the gzip path always UTF-8 -- kept as found.
    if _is_gz(fufi):
        with gzip.open(fufi, 'wb') as the_file:
            the_file.write(string.encode())
    else:
        with open(fufi, 'w') as the_file:
            the_file.write(string)
    if do_verbose:
        print("filer: I wrote " + fufi)
    if not os.path.isfile(fufi):
        raise Exception(f"{fufi} should have been written.")
    if keep_time is not None:
        os.utime(fufi, (keep_time, keep_time))
    return True


def donere(out_fufi, in_fufis, do_verbose=False, do_allow_empty=False):
    """Tell whether *out_fufi* needs renewal ("does need renewal").

    Returns ``True`` when *out_fufi* is missing, when it is empty (unless
    *do_allow_empty* is set), or when any existing file in *in_fufis*
    has a later modification time. Otherwise returns ``False``.

    Input files that do not exist are reported on stderr and skipped.

    Raises:
        Exception: *in_fufis* is not a list (only checked once the
            output file is known to exist and pass the emptiness test).
    """
    if not os.path.isfile(out_fufi):
        if do_verbose:
            print("filer.donore does not see " + out_fufi)
        return True
    out_info = os.stat(out_fufi)
    if out_info.st_size == 0 and not do_allow_empty:
        if do_verbose:
            print(f'{out_fufi} is empty')
        return True
    out_mtime = out_info.st_mtime
    # A plain string would otherwise be iterated character by character.
    if not isinstance(in_fufis, list):
        raise Exception('filer.donere needs a list of in_fufis')
    for in_fufi in in_fufis:
        if not os.path.isfile(in_fufi):
            print("donere does not see the in_fufi " + in_fufi,
                  file=sys.stderr)
            continue
        if os.stat(in_fufi).st_mtime > out_mtime:
            if do_verbose:
                print(f'{out_fufi} older than {in_fufi}')
            return True
    if do_verbose:
        print("filer.donere skips " + out_fufi)
    return False