Views: 48
今週5月8日にペポルオーソリティ(デジ庁)のページ
https://www.digital.go.jp/policies/electronic_invoice/
にJP PINT 0.9.1の更新が反映されました。
公開されたページをひとつひとつ開いて確認するのもよいのですが、一覧性に欠けるのでPython3+BeautifulSoup4でページから定義情報を抽出するプログラムを作成しました。
#!/usr/bin/env python3
#
# generate JSON and CSV from PINT and JP PINT 0.9.1
#
# designed by SAMBUICHI, Nobuyuki (Sambuichi Professional Engineers Office)
# written by SAMBUICHI, Nobuyuki (Sambuichi Professional Engineers Office)
#
# MIT License
#
# Copyright (c) 2022 SAMBUICHI Nobuyuki (Sambuichi Professional Engineers Office)
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
from bs4 import BeautifulSoup
import bs4
from urllib import request
import urllib
import os
import sys
import ssl
import json
import csv
# Disable TLS certificate verification for every urllib HTTPS request.
# NOTE(review): this is a global, insecure override -- tolerable only for a
# one-off scraping script; confirm it is actually required for these hosts.
ssl._create_default_https_context = ssl._create_unverified_context
# Base URLs of the published PINT (international) specification pages.
pint_url = 'https://test-docs.peppol.eu/poacc/pint/pint02/pint/'
pint_semantic_url = 'https://test-docs.peppol.eu/poacc/pint/pint02/pint/trn-invoice/semantic-model/'
pint_syntax_url = 'https://test-docs.peppol.eu/poacc/pint/pint02/pint/trn-invoice/syntax/'
# Base URLs of the JP PINT (Japanese jurisdiction) specification pages.
root_url = 'https://test-docs.peppol.eu/pint/pint-jp/work-v1/'
jp_semantic_url = 'https://test-docs.peppol.eu/pint/pint-jp/work-v1/pint-jp/trn-invoice/semantic-model/'
jp_syntax_url = 'https://test-docs.peppol.eu/pint/pint-jp/work-v1/pint-jp/trn-invoice/syntax/'
# Validation-rule pages are not scraped yet; kept for future use.
# root_rule_url = 'https://test-docs.peppol.eu/pint/pint-jp/work-v1/pint-jp/trn-invoice/rule/'
# shared_rule_url = 'https://test-docs.peppol.eu/pint/pint-jp/work-v1/pint-jp/trn-invoice/rule/PINT-UBL-validation-preprocessed/'
# aligned_rule_url = 'https://test-docs.peppol.eu/pint/pint-jp/work-v1/pint-jp/trn-invoice/rule/PINT-jurisdiction-aligned-rules/'
def parse_semantic(PINT, base_url, out_file, ITEM):
    """Scrape the semantic-model index table at *base_url* and the per-term
    detail page linked for each row, then dump the collected records to
    *out_file* as a JSON list.

    PINT     -- 'PINT' or 'JP_PINT'; selects the detail-page base URL and
                the prefix used to resolve syntax-binding links.
    base_url -- URL of the semantic-model index page (an HTML table).
    out_file -- path of the JSON file to write.
    ITEM     -- label used only in progress messages (e.g. 'Semantic').

    Returns True on success, False when the fetch fails with an HTTP or
    URL error (a message is printed to stderr).
    """
    try:
        response = request.urlopen(base_url)
        soup = BeautifulSoup(response, "lxml")
        response.close()
        tr_s = soup.find_all('tr')
        # First table row holds the column headers (Id, Business Term, ...).
        header = [x.text.strip() for x in tr_s[0].find_all('th')]
        results = []
        for i in range(len(tr_s) - 1):
            item = [x.text.strip() for x in tr_s[i + 1].find_all('td')]
            item_id = item[0].lower()
            term = item[1]
            # The nesting depth is rendered as leading bullet characters
            # ("•" + NBSP); count them to recover the hierarchy level.
            t = term.replace("•\xa0", '_')
            t1 = t.replace('_ ', '_')
            t2 = t1.replace('_', '')
            level = len(t1) - len(t2)
            section = item[2]
            card = item[3]
            desc = item[4]
            data = {}
            data[header[0]] = item_id
            data['level'] = level
            data[header[1]] = t2
            data[header[2]] = section
            data[header[3]] = card
            data[header[4]] = desc
            if 'PINT' == PINT:
                item_url = f'{pint_semantic_url}{item_id}/'
            elif 'JP_PINT' == PINT:
                item_url = f'{jp_semantic_url}{item_id}/'
            else:
                # Fail loudly instead of raising UnboundLocalError below.
                raise ValueError(f'unknown specification label: {PINT}')
            data['item_url'] = item_url
            # Fetch the per-term detail page and copy its <dt>/<dd> pairs.
            response = request.urlopen(item_url)
            soup = BeautifulSoup(response, "lxml")
            response.close()
            dl = soup.find('dl')
            title = [x.text for x in dl.find_all('dt')]
            value = [x for x in dl.find_all('dd')]
            for idx in range(len(title)):
                data[title[idx]] = value[idx].text
            # The last <dd> holds the syntax binding (an XPath-like string
            # plus links); keep only bindings rooted at a ubl: element.
            Syntaxbinding = value[-1]
            Syntaxbindings = Syntaxbinding.find_all('a')
            path_url = ''
            if len(Syntaxbindings) > 0:
                if 'ubl:' in Syntaxbinding.text:
                    Syntaxbinding = '/' + Syntaxbinding.text.strip().replace(' / ', '/')
                else:
                    Syntaxbinding = ''
                data['Syntax binding'] = Syntaxbinding
                href = Syntaxbindings[-1].attrs['href']
                # The hrefs are relative ('../' chains); the slices strip the
                # trailing path segments of the syntax URL so the cleaned href
                # resolves against the right root (lengths match the URLs above).
                if 'PINT' == PINT:
                    path_url = f"{pint_syntax_url[:-36]}{href.replace('../', '')}"
                elif 'JP_PINT' == PINT:
                    path_url = f"{jp_syntax_url[:-27]}{href.replace('../', '')}"
            else:
                Syntaxbinding = ''
                data['Syntax binding'] = Syntaxbinding
            data['path_url'] = path_url
            results.append(data)
            print(f'{i} {item_id} {level} {t2} {Syntaxbinding}')
        # utf-8 + ensure_ascii=False keep the Japanese text readable and
        # make the output independent of the platform's locale encoding.
        with open(out_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=4, ensure_ascii=False)
        print(f'write {ITEM} {out_file}')
        return True
    except urllib.error.HTTPError as err:
        print("WARN", err.code, base_url, file=sys.stderr)
        return False
    except urllib.error.URLError as err:
        print("ERROR", err.reason, base_url, file=sys.stderr)
        return False
def semantic2csv(out_file, csv_file, ITEM):
    """Convert the JSON list written by parse_semantic into a CSV file.

    out_file -- path of the JSON file to read.
    csv_file -- path of the CSV file to write.
    ITEM     -- label used only in the progress message (e.g. 'Semantic').
    """
    with open(out_file, 'r', encoding='utf-8') as f:
        results = json.load(f)
    keys = ['Id', 'level', 'Business Term', 'Section', 'Card.', 'Definition',
            'item_url', 'Cardinality', 'Semantic datatype', 'Name',
            'Syntax binding', 'path_url']
    # newline='' is required by the csv module; utf-8 keeps Japanese text
    # portable. extrasaction='ignore' drops any detail-page field not in
    # *keys* instead of raising ValueError; missing fields become ''.
    with open(csv_file, 'w', newline='', encoding='utf-8') as output_file:
        dict_writer = csv.DictWriter(output_file, keys, extrasaction='ignore')
        dict_writer.writeheader()
        dict_writer.writerows(results)
    print(f'write {ITEM} {csv_file}')
def parse_syntax(PINT, base_url, out_file, ITEM):
    """Scrape the UBL syntax index table at *base_url* and the per-element
    detail page linked for each row, then dump the collected records to
    *out_file* as a JSON list.

    PINT     -- 'PINT' or 'JP_PINT'; selects how relative hrefs resolve.
    base_url -- URL of the syntax index page (an HTML table).
    out_file -- path of the JSON file to write.
    ITEM     -- label used only in progress messages (e.g. 'Syntax').

    Returns True on success, False when a fetch fails with an HTTP or URL
    error; the URL that was actually being fetched is reported to stderr.
    """
    current_url = base_url  # tracks the URL in flight, for error reporting
    try:
        response = request.urlopen(base_url)
        soup = BeautifulSoup(response, "lxml")
        response.close()
        tr_s = soup.find_all('tr')
        header = [x.text.strip() for x in tr_s[0].find_all('th')]
        results = []
        for i in range(len(tr_s) - 1):
            card = None
            level = None
            element_text = None
            term_id = None
            businessterm = None
            item = [x for x in tr_s[i + 1].find_all('td')]
            card = item[0].text.strip()
            element = item[1]
            text = element.text.strip()
            # Nesting depth is rendered as leading bullets ("•" + NBSP);
            # count them to recover the tree level.
            t = text.replace("•\xa0", '_')
            t1 = t.replace('_ ', '_')
            t2 = t1.replace('_', '')
            level = len(t1) - len(t2)
            data = {}
            data['card'] = card
            data['level'] = level
            element_text = t2.strip()
            if '\n' in element_text:
                element_text = element_text.replace('\n', '')
            data['element'] = element_text
            # Third cell: element name, optionally followed by a
            # newline-separated description.
            name = item[2].text.strip()
            if name:
                if '\n' in name:
                    term = name[:name.index('\n')].strip()
                    desc = name[name.index('\n') + 1:].strip()
                    data['term'] = term
                    data['desc'] = desc
                else:
                    data['term'] = name
            if element:
                el = element_text.replace(':', '-')
                if 'ubl-Invoice' == el:
                    # Root element: no detail page, and no meaningful level.
                    data['level'] = None
                else:
                    href = element.find_all('a')[0].attrs['href']
                    # Collapse the relative '../' chain against the right root.
                    if 'PINT' == PINT:
                        path = f"{pint_syntax_url}{href}"
                        path_url = path.replace('pint/trn-invoice/syntax/../../../', '')
                    elif 'JP_PINT' == PINT:
                        path = f"{jp_syntax_url}{href}"
                        path_url = path.replace('pint-jp/trn-invoice/syntax/../../../', '')
                    data['path_url'] = path_url
                    # Fetch the per-element detail page and copy its
                    # <dt>/<dd> pairs.
                    current_url = path_url
                    response = request.urlopen(path_url)
                    soup = BeautifulSoup(response, "lxml")
                    response.close()
                    dl = soup.find('dl')
                    title = [x.text for x in dl.find_all('dt')]
                    value = [x for x in dl.find_all('dd')]
                    for idx in range(len(title)):
                        data[title[idx]] = value[idx].text.strip()
                        if 'Business Term' == title[idx]:
                            # Normalize 'IBG-xx / IBT-xx - name' to lowercase
                            # ids and split into id + business-term name.
                            bt = data[title[idx]]
                            bt = bt.replace('IBG', 'ibg')
                            bt = bt.replace('IBT', 'ibt')
                            data[title[idx]] = bt
                            if '\n' in bt:
                                bt = bt.split('\n')[1]
                            term_id = bt[:bt.index(' - ')].strip().lower()
                            businessterm = bt[bt.index(' - ') + 3:].strip()
                            data['id'] = term_id
                            data['businessterm'] = businessterm
            print(f'{i} {card} {level} {element_text} {term_id} {businessterm}')
            results.append(data)
        # utf-8 + ensure_ascii=False keep Japanese text readable and make
        # the output independent of the platform's locale encoding.
        with open(out_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=4, ensure_ascii=False)
        print(f'write {ITEM} {out_file}')
        return True
    except urllib.error.HTTPError as err:
        print("WARN", err.code, current_url, file=sys.stderr)
        return False
    except urllib.error.URLError as err:
        print("ERROR", err.reason, current_url, file=sys.stderr)
        return False
def syntax2csv(out_file, csv_file, ITEM):
    """Convert the JSON list written by parse_syntax into a CSV file.

    Embedded newlines in the free-text fields are escaped as the literal
    two-character sequence '\\n' so every record occupies one physical line.

    out_file -- path of the JSON file to read.
    csv_file -- path of the CSV file to write.
    ITEM     -- label used only in the progress message (e.g. 'Syntax').
    """
    with open(out_file, 'r', encoding='utf-8') as f:
        results = json.load(f)
    keys = ["card", "level", "element", "term", "desc", "path_url",
            "Attribute", "Cardinality", "Element", "Namespace", "Selector",
            "Section", "Business Term", "id", "businessterm"]
    # Fields whose values may contain embedded newlines.
    multiline_fields = ('term', 'desc', 'Business Term', 'businessterm')
    with open(csv_file, 'w', newline='', encoding='utf-8') as output_file:
        dict_writer = csv.DictWriter(output_file, keys)
        dict_writer.writeheader()
        for row in results:
            for key in multiline_fields:
                if key in row and '\n' in row[key]:
                    row[key] = row[key].replace('\n', '\\n')
            dict_writer.writerow(row)
    print(f'write {ITEM} {csv_file}')
def parse_rule(base_url, ITEM):
    """Placeholder for scraping the validation-rule pages; not implemented.

    The rule-page URLs are present (commented out) near the top of the file.
    """
    pass
def main():
    """Scrape both specifications (PINT, JP_PINT) for both page types
    (Semantic model, Syntax) and write <SPEC>_<ITEM>.json / .csv files
    next to this script.

    Order matches the original script: Semantic PINT, Semantic JP_PINT,
    Syntax PINT, Syntax JP_PINT.
    """
    base_dir = os.path.dirname(__file__)
    # (ITEM label, scrape function, csv converter, {spec label: index URL})
    jobs = [
        ('Semantic', parse_semantic, semantic2csv,
         {'PINT': pint_semantic_url, 'JP_PINT': jp_semantic_url}),
        ('Syntax', parse_syntax, syntax2csv,
         {'PINT': pint_syntax_url, 'JP_PINT': jp_syntax_url}),
    ]
    for ITEM, scrape, to_csv, urls in jobs:
        for PINT, index_url in urls.items():
            out_file = os.path.join(base_dir, f'{PINT}_{ITEM}.json')
            csv_file = os.path.join(base_dir, f'{PINT}_{ITEM}.csv')
            scrape(PINT, index_url, out_file, ITEM)
            to_csv(out_file, csv_file, ITEM)

if __name__ == '__main__':
    main()
CSV
JP PINT 0.9.1
JP_PINT_Semantic
JP_PINT_Syntax

