import glob
import os
import sys

import lxml.etree as etree
from lxml.builder import ElementMaker

import filing
from docids import Docids
from infile import Infile
from nitpo import Nitpo


class Itresu(Nitpo):
    """Find docids of an input file that already occur in prior files.

    For a given input file, the docids it contains are compared against
    the docids of the prior files found next to its docids file. The
    matches are collected per repcode in ``self.dups``, can be dumped to
    the itresu folder (``update``) or rendered as XML (``xml``).

    Configuration (``self.conf``, ``self.const``, ``self.has_conf``) is
    presumably provided by the ``Nitpo`` base class -- confirm there.
    """

    def __init__(self, do_verbose=False):
        """build the itresu

        Exits the process with a non-zero status when no itresu folder
        is configured under ``folders``.
        """
        super().__init__()
        if not self.has_conf('folders', 'itresu'):
            print("I need an itresu folder.", file=sys.stderr)
            # an error exit must not report success to the caller
            sys.exit(1)
        # Clark-notation namespace prefix used to build qualified tags
        self.N = "{%s}" % self.const['ns']
        self.E = ElementMaker(nsmap={None: self.const['ns']})
        folders = self.conf['folders']
        self.itresu_fudi = folders['itresu']
        self.repis_fudi = folders['repis']
        self.docids_fudi = folders['docids']
        self.docids = Docids(do_verbose=do_verbose)
        self.infile = Infile(do_verbose=do_verbose)
        # # docids in the repis we look at
        self.repis_docids = {}
        # # docids in the repis that are left after duplicates
        self.active_docids = {}
        # # duplicates, by repcode, initial state
        self.dups = None
        self.do_verbose = do_verbose

    def update(self, fufi):
        """Compute the duplicates for fufi and dump them.

        The result of ``get_dups(fufi)`` is written with ``filing.dump``
        to ``<itresu folder>/<base of fufi>.json.gz``.

        NOTE(review): ``get_dups`` may return None; how ``filing.dump``
        treats None is not visible here -- confirm.
        """
        dups = self.get_dups(fufi)
        out_bana = self.infile.get_base(fufi) + '.json.gz'
        out_fufi = self.itresu_fudi + '/' + out_bana
        filing.prepare(out_fufi)
        filing.dump(dups, out_fufi, no_rewrite=True)
        # # Done!

    def get_dups(self, fufi, limit_repcodes=None):
        """Collect the docids of fufi that also appear in prior files.

        fufi           -- the input file to examine
        limit_repcodes -- optional container of repcodes; when given,
                          prior files with another repcode are ignored

        Returns ``self.dups``, a dict mapping a repcode to the list of
        docids duplicated in that repcode's prior file, or None when no
        docid of fufi is left after removing the duplicates.
        """
        # Reset all per-call state, not only self.dups, so a reused
        # instance does not carry docids over from an earlier file.
        self.dups = {}
        self.repis_docids = {}
        self.active_docids = {}
        issuedate = self.infile.get_issuedate(fufi)
        self.docids.update_issuedate(issuedate)
        docids_fufi = self.docids.get_out_fufi(fufi)
        for docid in self.docids.get_docids(docids_fufi):
            self.repis_docids[docid] = 1
            # # active docids that remain after duplication
            self.active_docids[docid] = 1
        docids_fudi = os.path.dirname(docids_fufi)
        limit_prior = self.infile.get_prior(fufi)
        for prior_fufi in self.infile.list_by_prior(docids_fudi):
            # stop at the first file whose prior equals that of fufi
            if self.infile.get_prior(prior_fufi) == limit_prior:
                break
            if limit_repcodes is not None:
                repcode = self.infile.get_repcode(prior_fufi)
                if repcode not in limit_repcodes:
                    continue
            # # augments self.dups
            self.add_prior_fufi(prior_fufi, limit_repcodes=limit_repcodes)
        if not self.active_docids:
            # # return None to say nothing to do
            return None
        return self.dups

    def add_prior_fufi(self, fufi, limit_repcodes=None):
        """Record the docids of one prior file that are duplicates.

        Every docid of fufi that is in ``self.repis_docids`` is appended
        to ``self.dups[repcode]`` and removed from
        ``self.active_docids``. Files whose repcode is not in
        limit_repcodes (when given) are skipped.

        Returns ``self.dups``.
        """
        if self.dups is None:
            # allow a call that was not preceded by get_dups()
            self.dups = {}
        repcode = self.infile.get_repcode(fufi)
        if limit_repcodes is not None and repcode not in limit_repcodes:
            return self.dups
        for docid in self.docids.get_docids(fufi):
            if docid not in self.repis_docids:
                continue
            self.dups.setdefault(repcode, []).append(docid)
            self.active_docids.pop(docid, None)
        return self.dups

    def xml(self, fufi, repcodes):
        """form the XML for itresu limited to repcodes

        fufi     -- input file; only used to compute the duplicates when
                    they have not been computed yet
        repcodes -- container of repcodes to include, or None for all

        Returns a ``dups`` element holding, per included repcode, a
        ``prirp`` element and one ``dup`` element per duplicated docid,
        plus a ``count`` element with the number of distinct docids
        when there is at least one.
        """
        if self.dups is None:
            self.get_dups(fufi)
        dups_ele = self.E('dups')
        all_docids = set()
        for repcode, docids in self.dups.items():
            # keep only the requested repcodes
            if repcodes is not None and repcode not in repcodes:
                continue
            # prior repcode
            prirp_ele = etree.SubElement(dups_ele, self.N + 'prirp')
            prirp_ele.attrib['repcode'] = repcode
            for docid in docids:
                all_docids.add(docid)
                dup_ele = etree.SubElement(dups_ele, self.N + 'dup')
                dup_ele.attrib['repcode'] = repcode
                dup_ele.attrib['docid'] = docid
        total_omitted = len(all_docids)
        if total_omitted > 0:
            count_ele = etree.SubElement(dups_ele, self.N + 'count')
            count_ele.text = str(total_omitted)
        return dups_ele