# import glob
import os
import sys

import lxml.etree as etree
from lxml.builder import ElementMaker

import filing
from nitpo import Nitpo
from docids import Docids
from infile import Infile


class Itresu(Nitpo):
    """Track docids of one issue file that already appeared in prior files.

    The result ("dups") maps a repcode to the list of docids that were
    found both in the file under consideration and in a prior file
    carrying that repcode.
    """

    def __init__(self, do_verbose=False):
        """Build the itresu.

        Exits the process with a non-zero status when the configuration
        has no ``folders`` / ``itresu`` entry.

        Args:
            do_verbose: passed through to the ``Docids`` and ``Infile``
                helpers and stored on the instance.
        """
        super().__init__()
        if not self.has_conf('folders', 'itresu'):
            print("I need an itresu folder.", file=sys.stderr)
            # a missing folder is a fatal configuration error, so report
            # failure to the calling shell rather than exiting with 0
            sys.exit(1)
        # "{namespace}" prefix, prepended to tag names handed to lxml
        self.N = "{%s}" % self.const['ns']
        # element factory whose default namespace is the configured one
        self.E = ElementMaker(nsmap={None: self.const['ns']})
        self.itresu_fudi = self.conf['folders']['itresu']
        # self.repis_fudi = self.conf['folders']['repis']
        # self.docids_fudi = self.conf['folders']['docids']
        self.docids = Docids(do_verbose=do_verbose)
        self.infile = Infile(do_verbose=do_verbose)
        # docids in the repis we look at
        self.repis_docids = {}
        # docids in the repis that are left after duplicates
        self.active_docids = {}
        # duplicates, by repcode; None until get_dups() has run
        self.dups = None
        self.do_verbose = do_verbose

    def update(self, fufi, do_write=False):
        """Compute the duplicates for *fufi* and optionally write them out.

        The output file is ``<base of fufi>.json.gz`` inside the itresu
        folder. ``filing.prepare`` is called on that path even when
        nothing is written.

        Args:
            fufi: the input file to compute duplicates for.
            do_write: when true, dump the duplicates to the output file.

        Returns:
            The duplicates dict when ``do_write`` is false, ``None`` after
            writing.
            NOTE(review): the two return shapes differ; kept as-is because
            callers may rely on it.
        """
        dups = self.get_dups(fufi)
        out_bana = self.infile.get_base(fufi) + '.json.gz'
        out_fufi = os.path.join(self.itresu_fudi, out_bana)
        # presumably makes sure the target location exists -- TODO confirm
        filing.prepare(out_fufi)
        if not do_write:
            return dups
        filing.dump(dups, out_fufi, no_rewrite=True)
        return None

    def get_dups(self, fufi):
        """Collect the duplicates of *fufi* against its prior files.

        To be done at the repis parsing time. Resets ``self.dups``, records
        the docids of *fufi* in ``self.repis_docids``, then feeds every
        file listed by ``infile.list_by_prior`` to ``add_prior_fufi``
        until a file with the same prior as *fufi* is reached.

        NOTE(review): ``self.repis_docids`` is not cleared here, so docids
        accumulate over repeated calls on the same instance -- confirm
        that this is intended.

        Args:
            fufi: the input file to compute duplicates for.

        Returns:
            ``self.dups``, a dict mapping repcode to a list of docids.
        """
        self.dups = {}
        issuedate = self.infile.get_issuedate(fufi)
        self.docids.update_issuedate(issuedate)
        docids_fufi = self.docids.get_out_fufi(fufi)
        for docid in self.docids.get_docids(docids_fufi):
            self.repis_docids[docid] = 1
        docids_fudi = os.path.dirname(docids_fufi)
        limit_prior = self.infile.get_prior(fufi)
        for prior_fufi in self.infile.list_by_prior(docids_fudi):
            if self.infile.get_prior(prior_fufi) == limit_prior:
                # last file to parse reached
                break
            self.add_prior_fufi(prior_fufi)
        return self.dups

    def add_prior_fufi(self, fufi):
        """Register the docids of one prior file as duplicates.

        Every docid of *fufi* that is also in ``self.repis_docids`` is
        appended to ``self.dups`` under the repcode of *fufi* and removed
        from ``self.active_docids`` if present there.

        ``self.dups`` must already be a dict, i.e. ``get_dups`` must have
        initialised it.

        Args:
            fufi: a prior docids file.

        Returns:
            ``self.dups`` after the update.
        """
        repcode = self.infile.get_repcode(fufi)
        for docid in self.docids.get_docids(fufi):
            if docid not in self.repis_docids:
                continue
            self.dups.setdefault(repcode, []).append(docid)
            self.active_docids.pop(docid, None)
        return self.dups

    def are_any_left(self, fufi, repcodes):
        """Tell whether docids remain after removing those of *repcodes*.

        Args:
            fufi: the file whose docids are looked up through
                ``docids.of_repis``.
            repcodes: container of repcodes to take into account, or
                ``None`` to take all repcodes of ``self.dups``.

        Returns:
            ``True`` when there are more docids than covered duplicates,
            ``False`` when the two counts are equal.

        Raises:
            Exception: when ``self.dups`` has not been set, or when more
                docids are covered than exist.
        """
        docids = self.docids.of_repis(fufi)
        if self.dups is None:
            raise Exception('dups must be set')
        # distinct docids that are duplicated under a selected repcode
        covered_docids = set()
        for repcode, dup_docids in self.dups.items():
            if repcodes is not None and repcode not in repcodes:
                continue
            covered_docids.update(dup_docids)
        len_docids = len(docids)
        len_covered = len(covered_docids)
        if len_docids > len_covered:
            return True
        if len_docids == len_covered:
            return False
        raise Exception('should not be here')

    def xml(self, fufi, repcodes):
        """Form the XML for itresu, limited to *repcodes*.

        Builds a ``dups`` element holding one ``dup`` child, with
        ``repcode`` and ``docid`` attributes, per recorded duplicate. When
        at least one docid was emitted, a ``count`` child carrying the
        number of distinct docids is appended.

        Args:
            fufi: the input file; only used to fill ``self.dups`` when it
                has not been computed yet.
            repcodes: container of repcodes to include, or ``None`` to
                include all of them.

        Returns:
            The ``dups`` lxml element.
        """
        if self.dups is None:
            # fills self.dups
            self.get_dups(fufi)
        dups_ele = self.E(self.N + 'dups')
        all_docids = set()
        for repcode, dup_docids in self.dups.items():
            if repcodes is not None and repcode not in repcodes:
                continue
            # prior repcode
            # prirp_ele = etree.SubElement(dups_ele, self.N + 'prirp')
            # prirp_ele.attrib['repcode'] = repcode
            for docid in dup_docids:
                all_docids.add(docid)
                dup_ele = etree.SubElement(dups_ele, self.N + 'dup')
                dup_ele.attrib['repcode'] = repcode
                dup_ele.attrib['docid'] = docid
        total_omitted = len(all_docids)
        # the count could be done in XSLT, but it is easier done here
        if total_omitted > 0:
            count_ele = etree.SubElement(dups_ele, self.N + 'count')
            count_ele.text = str(total_omitted)
        return dups_ele