This week, on May 8, the JP PINT 0.9.1 update was reflected on the Peppol Authority (Japan's Digital Agency) page:
https://www.digital.go.jp/policies/electronic_invoice/
You can open and check the published pages one by one, but that makes it hard to get an overall view, so I wrote a program in Python 3 + BeautifulSoup4 that extracts the definition information from the pages.
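The script below crawls both the base PINT pages and the JP PINT pages. parse_semantic() reads the semantic-model table, recovers each business term's nesting level from the bullet characters ("•\xa0") used for indentation, and follows the link to every term's detail page; parse_syntax() does the same for the UBL syntax table. Each result set is first saved as JSON and then flattened to CSV by semantic2csv() and syntax2csv().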
```python
#!/usr/bin/env python3
#
# generate JSON and CSV from PINT and JP PINT 0.9.1
#
# designed by SAMBUICHI, Nobuyuki (Sambuichi Professional Engineers Office)
# written by SAMBUICHI, Nobuyuki (Sambuichi Professional Engineers Office)
#
# MIT License
#
# Copyright (c) 2022 SAMBUICHI Nobuyuki (Sambuichi Professional Engineers Office)
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
from bs4 import BeautifulSoup
import bs4
from urllib import request
import urllib
import os
import sys
import ssl
import json
import csv

ssl._create_default_https_context = ssl._create_unverified_context

pint_url = 'https://test-docs.peppol.eu/poacc/pint/pint02/pint/'
pint_semantic_url = 'https://test-docs.peppol.eu/poacc/pint/pint02/pint/trn-invoice/semantic-model/'
pint_syntax_url = 'https://test-docs.peppol.eu/poacc/pint/pint02/pint/trn-invoice/syntax/'
root_url = 'https://test-docs.peppol.eu/pint/pint-jp/work-v1/'
jp_semantic_url = 'https://test-docs.peppol.eu/pint/pint-jp/work-v1/pint-jp/trn-invoice/semantic-model/'
jp_syntax_url = 'https://test-docs.peppol.eu/pint/pint-jp/work-v1/pint-jp/trn-invoice/syntax/'
# root_rule_url = 'https://test-docs.peppol.eu/pint/pint-jp/work-v1/pint-jp/trn-invoice/rule/'
# shared_rule_url = 'https://test-docs.peppol.eu/pint/pint-jp/work-v1/pint-jp/trn-invoice/rule/PINT-UBL-validation-preprocessed/'
# aligned_rule_url = 'https://test-docs.peppol.eu/pint/pint-jp/work-v1/pint-jp/trn-invoice/rule/PINT-jurisdiction-aligned-rules/'


def parse_semantic(PINT, base_url, out_file, ITEM):
    try:
        response = request.urlopen(base_url)
        soup = BeautifulSoup(response, "lxml")
        response.close()
        #
        tr_s = soup.find_all('tr')
        header = [x.text.strip() for x in tr_s[0].find_all('th')]
        results = []
        for i in range(len(tr_s) - 1):
            item = [x.text.strip() for x in tr_s[i + 1].find_all('td')]
            id = item[0].lower()
            term = item[1]
            # the nesting level is encoded as a run of bullet characters
            t = term.replace("•\xa0", '_')
            t1 = t.replace('_ ', '_')
            t2 = t1.replace('_', '')
            level = len(t1) - len(t2)
            section = item[2]
            card = item[3]
            desc = item[4]
            data = {}
            data[header[0]] = id
            data['level'] = level
            data[header[1]] = t2
            data[header[2]] = section
            data[header[3]] = card
            data[header[4]] = desc
            if 'PINT' == PINT:
                item_url = f'{pint_semantic_url}{id}/'
            elif 'JP_PINT' == PINT:
                item_url = f'{jp_semantic_url}{id}/'
            data['item_url'] = item_url
            # follow the business term's detail page and copy its <dl> entries
            response = request.urlopen(item_url)
            soup = BeautifulSoup(response, "lxml")
            response.close()
            dl = soup.find('dl')
            title = [x.text for x in dl.find_all('dt')]
            value = [x for x in dl.find_all('dd')]
            for idx in range(len(title)):
                data[title[idx]] = value[idx].text
            Syntaxbinding = value[-1]
            Syntaxbindings = Syntaxbinding.find_all('a')
            path_url = ''
            if len(Syntaxbindings) > 0:
                if 'ubl:' in Syntaxbinding.text:
                    Syntaxbinding = '/' + Syntaxbinding.text.strip().replace(' / ', '/')
                else:
                    Syntaxbinding = ''
                data['Syntax binding'] = Syntaxbinding
                href = Syntaxbindings[-1].attrs['href']
                if 'PINT' == PINT:
                    path_url = f"{pint_syntax_url[:-36]}{href.replace('../', '')}"
                elif 'JP_PINT' == PINT:
                    path_url = f"{jp_syntax_url[:-27]}{href.replace('../', '')}"
            else:
                Syntaxbinding = ''
                data['Syntax binding'] = Syntaxbinding
            data['path_url'] = path_url
            results.append(data)
            print(f'{i} {id} {level} {t2} {Syntaxbinding}')
        with open(out_file, 'w') as f:
            json.dump(results, f, indent=4)
        print(f'write {ITEM} {out_file}')
    except urllib.error.HTTPError as err:
        print("WARN", err.code, base_url, file=sys.stderr)
        return False
    except urllib.error.URLError as err:
        print("ERROR", err.reason, base_url, file=sys.stderr)
        return False


def semantic2csv(out_file, csv_file, ITEM):
    with open(out_file, 'r') as f:
        results = json.load(f)
    keys = ['Id', 'level', 'Business Term', 'Section', 'Card.', 'Definition',
            'item_url', 'Cardinality', 'Semantic datatype', 'Name',
            'Syntax binding', 'path_url']
    with open(csv_file, 'w', newline='') as output_file:
        dict_writer = csv.DictWriter(output_file, keys)
        dict_writer.writeheader()
        dict_writer.writerows(results)
    print(f'write {ITEM} {csv_file}')


def parse_syntax(PINT, base_url, out_file, ITEM):
    path_url = None  # remember the last detail-page URL for error reporting
    try:
        response = request.urlopen(base_url)
        soup = BeautifulSoup(response, "lxml")
        response.close()
        tr_s = soup.find_all('tr')
        header = [x.text.strip() for x in tr_s[0].find_all('th')]
        results = []
        for i in range(len(tr_s) - 1):
            card = None
            level = None
            element_text = None
            id = None
            businessterm = None
            item = [x for x in tr_s[i + 1].find_all('td')]
            card = item[0].text.strip()
            element = item[1]
            text = element.text.strip()
            t = text.replace("•\xa0", '_')
            t1 = t.replace('_ ', '_')
            t2 = t1.replace('_', '')
            level = len(t1) - len(t2)
            data = {}
            data['card'] = card
            data['level'] = level
            element_text = t2.strip()
            if '\n' in element_text:
                element_text = element_text.replace('\n', '')
            data['element'] = element_text
            name = item[2].text.strip()
            if name:
                if '\n' in name:
                    term = name[:name.index('\n')].strip()
                    desc = name[name.index('\n') + 1:].strip()
                    data['term'] = term
                    data['desc'] = desc
                else:
                    data['term'] = name
            if element:
                el = element_text.replace(':', '-')
                if 'ubl-Invoice' == el:
                    data['level'] = None
                else:
                    href = element.find_all('a')[0].attrs['href']
                    if 'PINT' == PINT:
                        path = f"{pint_syntax_url}{href}"
                        path_url = path.replace('pint/trn-invoice/syntax/../../../', '')
                    elif 'JP_PINT' == PINT:
                        path = f"{jp_syntax_url}{href}"
                        path_url = path.replace('pint-jp/trn-invoice/syntax/../../../', '')
                    data['path_url'] = path_url
                    response = request.urlopen(path_url)
                    soup = BeautifulSoup(response, "lxml")
                    response.close()
                    dl = soup.find('dl')
                    title = [x.text for x in dl.find_all('dt')]
                    value = [x for x in dl.find_all('dd')]
                    for idx in range(len(title)):
                        data[title[idx]] = value[idx].text.strip()
                        if 'Business Term' == title[idx]:
                            bt = data[title[idx]]
                            bt = bt.replace('IBG', 'ibg')
                            bt = bt.replace('IBT', 'ibt')
                            data[title[idx]] = bt
                            if '\n' in bt:
                                bt = bt.split('\n')[1]
                            id = bt[:bt.index(' - ')].strip().lower()
                            businessterm = bt[bt.index(' - ') + 3:].strip()
                            data['id'] = id
                            data['businessterm'] = businessterm
            print(f'{i} {card} {level} {element_text} {id} {businessterm}')
            results.append(data)
        with open(out_file, 'w') as f:
            json.dump(results, f, indent=4)
        print(f'write {ITEM} {out_file}')
    except urllib.error.HTTPError as err:
        # report the detail page if one was being fetched, else the list page
        url = path_url or base_url
        print("WARN", err.code, url, file=sys.stderr)
        return False
    except urllib.error.URLError as err:
        url = path_url or base_url
        print("ERROR", err.reason, url, file=sys.stderr)
        return False


def syntax2csv(out_file, csv_file, ITEM):
    with open(out_file, 'r') as f:
        results = json.load(f)
    keys = ["card", "level", "element", "term", "desc", "path_url",
            "Attribute", "Cardinality", "Element", "Namespace", "Selector",
            "Section", "Business Term", "id", "businessterm"]
    with open(csv_file, 'w', newline='') as output_file:
        dict_writer = csv.DictWriter(output_file, keys)
        dict_writer.writeheader()
        for row in results:
            # escape embedded newlines so each record stays on one CSV line
            if 'term' in row and '\n' in row['term']:
                row['term'] = row['term'].replace('\n', '\\n')
            if 'desc' in row and '\n' in row['desc']:
                row['desc'] = row['desc'].replace('\n', '\\n')
            if 'Business Term' in row and '\n' in row['Business Term']:
                row['Business Term'] = row['Business Term'].replace('\n', '\\n')
            if 'businessterm' in row and '\n' in row['businessterm']:
                row['businessterm'] = row['businessterm'].replace('\n', '\\n')
            dict_writer.writerow(row)
    print(f'write {ITEM} {csv_file}')


def parse_rule(base_url, ITEM):
    pass


def main():
    dir = os.path.dirname(__file__)
    ITEM = 'Semantic'
    PINT = 'PINT'
    out_file = os.path.join(dir, f'{PINT}_{ITEM}.json')
    csv_file = os.path.join(dir, f'{PINT}_{ITEM}.csv')
    parse_semantic(PINT, pint_semantic_url, out_file, ITEM)
    semantic2csv(out_file, csv_file, ITEM)
    PINT = 'JP_PINT'
    out_file = os.path.join(dir, f'{PINT}_{ITEM}.json')
    csv_file = os.path.join(dir, f'{PINT}_{ITEM}.csv')
    parse_semantic(PINT, jp_semantic_url, out_file, ITEM)
    semantic2csv(out_file, csv_file, ITEM)
    ITEM = 'Syntax'
    PINT = 'PINT'
    out_file = os.path.join(dir, f'{PINT}_{ITEM}.json')
    csv_file = os.path.join(dir, f'{PINT}_{ITEM}.csv')
    parse_syntax(PINT, pint_syntax_url, out_file, ITEM)
    syntax2csv(out_file, csv_file, ITEM)
    PINT = 'JP_PINT'
    out_file = os.path.join(dir, f'{PINT}_{ITEM}.json')
    csv_file = os.path.join(dir, f'{PINT}_{ITEM}.csv')
    parse_syntax(PINT, jp_syntax_url, out_file, ITEM)
    syntax2csv(out_file, csv_file, ITEM)


if __name__ == '__main__':
    main()
```
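Running the script writes PINT_Semantic, JP_PINT_Semantic, PINT_Syntax and JP_PINT_Syntax next to it, each as both .json and .csv. As a quick check of the output, here is a minimal sketch (assuming the script above has already been run in the same directory) that prints the JP PINT semantic model as an indented tree from JP_PINT_Semantic.json; the key names are the ones the script itself writes:

```python
#!/usr/bin/env python3
# Minimal sketch: print the JP PINT semantic model as an indented tree
# from the JP_PINT_Semantic.json written by the script above (assumption:
# it has already been run and the file sits next to this script).
import json
import os

dir = os.path.dirname(__file__)
with open(os.path.join(dir, 'JP_PINT_Semantic.json'), 'r') as f:
    results = json.load(f)

for data in results:
    # 'level' counts the bullet characters the source table used
    # to indicate nesting depth
    indent = '  ' * data['level']
    print(f"{indent}{data['Id']} {data['Card.']} {data['Business Term']}")
```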
Generated CSV files (JP PINT 0.9.1): JP_PINT_Semantic.csv and JP_PINT_Syntax.csv.
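For the at-a-glance view of the syntax side, the CSV can be read straight back in. A minimal sketch, assuming JP_PINT_Syntax.csv is in the same directory; the column names are those written by syntax2csv(), and 'level' is empty for the root ubl:Invoice row:

```python
#!/usr/bin/env python3
# Minimal sketch: list the UBL elements with their cardinalities and
# business terms from JP_PINT_Syntax.csv (assumption: generated by the
# script above into the same directory).
import csv
import os

dir = os.path.dirname(__file__)
with open(os.path.join(dir, 'JP_PINT_Syntax.csv'), newline='') as f:
    for row in csv.DictReader(f):
        # 'level' is blank for the root element, so default it to 0
        level = int(row['level']) if row['level'] else 0
        indent = '  ' * level
        print(f"{indent}{row['card']} {row['element']} {row['id']} {row['businessterm']}")
```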