"""deals with the incoming report issue or docid files""" import os import glob import sys from nitpo import Nitpo class Infile(Nitpo): def __init__(self, do_verbose=False): super().__init__() ## an extenstion for the repis file has to be set if not self.has_conf('ext', 'repis'): print("infile needs Ф[ext][repis].", file=sys.stderr) sys.exit() ## the file has to be in the repis folder if not self.has_conf('folders', 'repis'): print("infile needs Ф[folders][repis].", file=sys.stderr) sys.exit() self.len_ext = len(self.conf['ext']['repis']) ## the repis seperator serperate the repcode from the priority ## indicator prior if not self.has_conf('chars', 'repisep'): print("infile needs Ф[char][repisep]", file=sys.stderr) sys.exit() self.repisep = self.conf['chars']['repisep'] self.do_verbose = do_verbose def bana_chop_ext(self, fufi): bana = os.path.basename(fufi) len_ext = self.len_ext if bana.endswith('.gz'): bana = bana[0:-3] bana = bana[0:-len_ext] return bana def get_repcode(self, fufi): """the repcode is the file base name before the repisep""" bana = os.path.basename(fufi) parts = bana.partition(self.repisep) return parts[0] def get_prior(self, fufi): """In fact this is only used on docid files.""" bana = os.path.basename(fufi) parts = bana.partition(self.repisep) after_repcode = parts[2] count_chop = self.len_ext if after_repcode.endswith('.gz'): count_chop += 3 ## this is used for docid files, not gzipped elif after_repcode.endswith('.txt'): count_chop = 4 prior = after_repcode[0:-count_chop] return prior def is_repis(self, fufi): """really: is this a repis file""" test_fufi = fufi if fufi.endswith('.gz'): test_fufi = fufi[0:-3] ## check that the fufi is in [folders][repis] if not fufi.startswith(self.conf['folders']['repis']): note = f"infile; {fufi} is not in the repis folder " note += f" {self.conf['folders']['repis']}" note += f" infile skips it." print(note) return False if test_fufi.endswith(self.conf['ext']['repis']): return True return False def get_issuedate(self, fufi): """the issue date is the directory above the location of the repis file""" return os.path.basename(os.path.dirname(fufi)) def get_base(self, fufi): """the base is the issuedate plus the base name, without extension""" issuedate = self.get_issuedate(fufi) bana = self.bana_chop_ext(fufi) base = issuedate + '/' + bana return base def is_duplicate(self, fufi): """find if a file with same repcode exists""" fudi = os.path.dirname(fufi) repcode = self.get_repcode(fufi) glob_string = f"{fudi}/*{repcode}*" fufis = glob.glob(glob_string) if len(fufis) == 0: return False return True def list_by_prior(self, fudi, do_show=False): """for repis and docids files""" glob_string = f"{fudi}/*" fufis = glob.glob(glob_string) fufi_list = [] priors = {} for fufi in fufis: if not self.is_known_type(fufi): print(f"infile: unkown file type {fufi}", file=sys.stderr) continue priors[fufi] = self.get_prior(fufi) if len(priors) == 0: return [] fufi_list = sorted(fufis, key=priors.get) if do_show: for fufi in fufi_list: print(fufi) return fufi_list def is_known_type(self, fufi): """known types of inputs used""" test_fufi = fufi if fufi.endswith('.gz'): test_fufi = fufi[-3] ## repis files if test_fufi.endswith('.xml'): return True ## docid files if test_fufi.endswith('.txt'): return True # # not used #if test_fufi.endswith('.json'): # return True #if test_fufi.endswith('.mail'): # return True return False