{"id":6233,"date":"2022-05-12T15:40:25","date_gmt":"2022-05-12T06:40:25","guid":{"rendered":"https:\/\/www.sambuichi.jp\/?p=6233"},"modified":"2022-08-11T12:29:05","modified_gmt":"2022-08-11T03:29:05","slug":"jp-pint-0-9-1%e3%82%92beautiful-soup%e3%81%a7%e8%a7%a3%e6%9e%90","status":"publish","type":"post","link":"https:\/\/www.sambuichi.jp\/?p=6233","title":{"rendered":"JP PINT 0.9.1\u3092Beautiful Soup\u3067\u89e3\u6790"},"content":{"rendered":"<p>Views: 49<\/p><p>\u4eca\u9031\uff15\u6708\uff18\u65e5\u306b\u30da\u30dd\u30eb\u30aa\u30fc\u30bd\u30ea\u30c6\u30a3\uff08\u30c7\u30b8\u5e81\uff09\u306e\u30da\u30fc\u30b8<br \/>\n<a href=\"https:\/\/www.digital.go.jp\/policies\/electronic_invoice\/\">https:\/\/www.digital.go.jp\/policies\/electronic_invoice\/<\/a><br \/>\n\u306bJP PINT 0.9.1\u306e\u66f4\u65b0\u304c\u53cd\u6620\u3055\u308c\u307e\u3057\u305f\u3002<\/p>\n<p>\u516c\u958b\u3055\u308c\u305f\u30da\u30fc\u30b8\u3092\u3072\u3068\u3064\u3072\u3068\u3064\u958b\u3044\u3066\u78ba\u8a8d\u3059\u308b\u306e\u3082\u3088\u3044\u306e\u3067\u3059\u304c\u3001\u4e00\u89a7\u6027\u306b\u6b20\u3051\u308b\u306e\u3067Python3+BeautifulSoup4\u3067\u30da\u30fc\u30b8\u304b\u3089\u5b9a\u7fa9\u60c5\u5831\u3092\u62bd\u51fa\u3059\u308b\u30d7\u30ed\u30b0\u30e9\u30e0\u3092\u4f5c\u6210\u3057\u307e\u3057\u305f\u3002<\/p>\n<pre class=\"height-set:true width-set:false lang:python decode:true \" >#!\/usr\/bin\/env python3\r\n# \r\n# generate JSON and CSV from PINT and JP PINT 0.9.1\r\n#\r\n# designed by SAMBUICHI, Nobuyuki (Sambuichi Professional Engineers Office)\r\n# written by SAMBUICHI, Nobuyuki (Sambuichi Professional Engineers Office)\r\n#\r\n# MIT License\r\n#\r\n# Copyright (c) 2022 SAMBUICHI Nobuyuki (Sambuichi Professional Engineers Office)\r\n# \r\n# Permission is hereby granted, free of charge, to any person obtaining a copy\r\n# of this software and associated documentation files (the \"Software\"), to deal\r\n# in the Software without restriction, including without limitation the 
rights\r\n# to use, copy, modify, merge, publish, distribute, sublicense, and\/or sell\r\n# copies of the Software, and to permit persons to whom the Software is\r\n# furnished to do so, subject to the following conditions:\r\n# The above copyright notice and this permission notice shall be included in all\r\n# copies or substantial portions of the Software.\r\n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\r\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\r\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\r\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\r\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\r\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\r\n# SOFTWARE.\r\n# \r\nfrom bs4 import BeautifulSoup\r\nimport bs4\r\nfrom urllib import request\r\nimport urllib\r\nimport os\r\nimport sys\r\nimport ssl\r\nimport json\r\nimport csv\r\n\r\nssl._create_default_https_context = ssl._create_unverified_context\r\n\r\npint_url = 'https:\/\/test-docs.peppol.eu\/poacc\/pint\/pint02\/pint\/'\r\npint_semantic_url = 'https:\/\/test-docs.peppol.eu\/poacc\/pint\/pint02\/pint\/trn-invoice\/semantic-model\/'\r\npint_syntax_url = 'https:\/\/test-docs.peppol.eu\/poacc\/pint\/pint02\/pint\/trn-invoice\/syntax\/'\r\n\r\nroot_url = 'https:\/\/test-docs.peppol.eu\/pint\/pint-jp\/work-v1\/'\r\njp_semantic_url = 'https:\/\/test-docs.peppol.eu\/pint\/pint-jp\/work-v1\/pint-jp\/trn-invoice\/semantic-model\/'\r\njp_syntax_url = 'https:\/\/test-docs.peppol.eu\/pint\/pint-jp\/work-v1\/pint-jp\/trn-invoice\/syntax\/'\r\n\r\n# root_rule_url = 'https:\/\/test-docs.peppol.eu\/pint\/pint-jp\/work-v1\/pint-jp\/trn-invoice\/rule\/' \r\n# shared_rule_url = 'https:\/\/test-docs.peppol.eu\/pint\/pint-jp\/work-v1\/pint-jp\/trn-invoice\/rule\/PINT-UBL-validation-preprocessed\/'\r\n# aligned_rule_url = 
'https:\/\/test-docs.peppol.eu\/pint\/pint-jp\/work-v1\/pint-jp\/trn-invoice\/rule\/PINT-jurisdiction-aligned-rules\/'\r\n\r\ndef parse_semantic(PINT,base_url,out_file,ITEM): \r\n    try: \r\n        response = request.urlopen(base_url) \r\n        soup = BeautifulSoup(response, \"lxml\")\r\n        response.close()\r\n        # \r\n        tr_s = soup.find_all('tr')\r\n        header = [x.text.strip() for x in tr_s[0].find_all('th')]\r\n        results = [] \r\n        for i in range(len(tr_s)-1):\r\n            item = [x.text.strip() for x in tr_s[i+1].find_all('td')]\r\n            id = item[0].lower()\r\n            term = item[1]\r\n            t = term.replace(\"\u2022\\xa0\",'_')\r\n            t1 = t.replace('_ ','_')\r\n            t2 = t1.replace('_','')\r\n            level = len(t1) - len(t2)\r\n            section = item[2]\r\n            card = item[3]\r\n            desc = item[4]\r\n            data = {}\r\n            data[header[0]] = id\r\n            data['level'] = level\r\n            data[header[1]] = t2\r\n            data[header[2]] = section\r\n            data[header[3]] = card\r\n            data[header[4]] = desc\r\n            if 'PINT'==PINT:\r\n                item_url = f'{pint_semantic_url}{id}\/'\r\n            elif 'JP_PINT'==PINT:\r\n                item_url = f'{jp_semantic_url}{id}\/'\r\n            data['item_url'] = item_url\r\n            response = request.urlopen(item_url)\r\n            soup = BeautifulSoup(response, \"lxml\")\r\n            response.close()\r\n            dl = soup.find('dl')\r\n            title = [x.text for x in dl.find_all('dt')]\r\n            value = [x for x in dl.find_all('dd')]\r\n            for idx in range(len(title)):\r\n                data[title[idx]] = value[idx].text\r\n            Syntaxbinding = value[-1]\r\n            Syntaxbindings = Syntaxbinding.find_all('a')\r\n            path_url = ''\r\n            if len(Syntaxbindings) &gt; 0:\r\n                if 'ubl:' in 
Syntaxbinding.text:\r\n                    Syntaxbinding = '\/'+Syntaxbinding.text.strip().replace(' \/ ','\/')\r\n                else:\r\n                    Syntaxbinding = ''\r\n                data['Syntax binding'] = Syntaxbinding\r\n                href = Syntaxbindings[-1].attrs['href']\r\n                if 'PINT'==PINT:\r\n                    path_url = f\"{pint_syntax_url[:-36]}{href.replace('..\/','')}\" \r\n                elif 'JP_PINT'==PINT:\r\n                    path_url =  f\"{jp_syntax_url[:-27]}{href.replace('..\/','')}\" \r\n            else:\r\n                Syntaxbinding = ''\r\n            data['Syntax binding'] = Syntaxbinding\r\n            data['path_url'] = path_url\r\n            results.append(data)\r\n            print(f'{i} {id} {level} {t2} {Syntaxbinding}')\r\n        with open(out_file, 'w') as f:\r\n            json.dump(results, f, indent=4)\r\n        print(f'write {ITEM} {out_file}')\r\n    except urllib.error.HTTPError as err:\r\n        print(\"WARN\", err.code, base_url, file=sys.stderr)\r\n        return False\r\n    except urllib.error.URLError as err:\r\n        print(\"ERROR\", err.reason, base_url, file=sys.stderr)\r\n        return False\r\n\r\ndef semantic2csv(out_file,csv_file,ITEM):\r\n    with open(out_file, 'r') as f:\r\n        results = json.load(f)\r\n    keys =  ['Id','level','Business Term','Section','Card.','Definition','item_url','Cardinality','Semantic datatype','Name','Syntax binding','path_url']\r\n    with open(csv_file, 'w', newline='') as output_file:\r\n        dict_writer = csv.DictWriter(output_file, keys)\r\n        dict_writer.writeheader()\r\n        dict_writer.writerows(results)\r\n    print(f'write {ITEM} {csv_file}')\r\n\r\ndef parse_syntax(PINT,base_url,out_file,ITEM):\r\n    try: \r\n        response = request.urlopen(base_url) \r\n        soup = BeautifulSoup(response, \"lxml\")\r\n        response.close()\r\n\r\n        tr_s = soup.find_all('tr')\r\n        header = [x.text.strip() 
for x in tr_s[0].find_all('th')]\r\n        results = [] \r\n        for i in range(len(tr_s)-1):\r\n            card = None\r\n            level = None\r\n            element_text = None\r\n            id = None\r\n            businessterm = None\r\n            item = [x for x in tr_s[i+1].find_all('td')]\r\n            card = item[0].text.strip()\r\n            element = item[1]\r\n            text = element.text.strip()\r\n            t = text.replace(\"\u2022\\xa0\",'_')\r\n            t1 = t.replace('_ ','_')\r\n            t2 = t1.replace('_','')\r\n            level = len(t1) - len(t2)\r\n            data = {}\r\n            data['card'] = card\r\n            data['level'] = level\r\n            element_text = t2.strip()\r\n            if '\\n' in element_text:\r\n                element_text =element_text.replace('\\n','')\r\n            data['element'] = element_text\r\n            name = item[2].text.strip()\r\n            if name:\r\n                if '\\n' in name:\r\n                    term = name[:name.index('\\n')].strip()\r\n                    desc = name[name.index('\\n')+1:].strip()\r\n                    data['term'] = term\r\n                    data['desc'] = desc\r\n                else:\r\n                    data['term'] = name\r\n            if element:\r\n                el = element_text.replace(':','-')\r\n                if 'ubl-Invoice'==el:\r\n                    data['level'] = None\r\n                else:\r\n                    href = element.find_all('a')[0].attrs['href']\r\n                    if 'PINT'==PINT:\r\n                        path = f\"{pint_syntax_url}{href}\"\r\n                        path_url = path.replace('pint\/trn-invoice\/syntax\/..\/..\/..\/','')\r\n                    elif 'JP_PINT'==PINT:\r\n                        path = f\"{jp_syntax_url}{href}\"\r\n                        path_url = path.replace('pint-jp\/trn-invoice\/syntax\/..\/..\/..\/','')\r\n                    data['path_url'] = path_url\r\n\r\n 
                   response = request.urlopen(path_url)\r\n                    soup = BeautifulSoup(response, \"lxml\")\r\n                    response.close()\r\n\r\n                    dl = soup.find('dl')\r\n                    title = [x.text for x in dl.find_all('dt')]\r\n                    value = [x for x in dl.find_all('dd')]\r\n                    for idx in range(len(title)):\r\n                        data[title[idx]] = value[idx].text.strip()\r\n                        if 'Business Term'==title[idx]:\r\n                            bt = data[title[idx]]\r\n                            bt = bt.replace('IBG','ibg')\r\n                            bt = bt.replace('IBT','ibt')\r\n                            data[title[idx]] = bt\r\n                            if '\\n' in bt:\r\n                                bt = bt.split('\\n')[1]\r\n                            id = bt[:bt.index(' - ')].strip().lower()\r\n                            businessterm = bt[bt.index(' - ')+3:].strip()\r\n                            data['id'] = id\r\n                            data['businessterm'] = businessterm\r\n            print(f'{i} {card} {level} {element_text} {id} {businessterm}')\r\n            results.append(data)\r\n        with open(out_file, 'w') as f:\r\n            json.dump(results, f, indent=4)\r\n        print(f'write {ITEM} {out_file}')\r\n    except urllib.error.HTTPError as err:\r\n        url = base_url or path\r\n        print(\"WARN\", err.code, url, file=sys.stderr)\r\n        return False\r\n    except urllib.error.URLError as err:\r\n        url = base_url or path\r\n        print(\"ERROR\", err.reason, url, file=sys.stderr)\r\n        return False\r\n\r\ndef syntax2csv(out_file,csv_file,ITEM):\r\n    with open(out_file, 'r') as f:\r\n        results = json.load(f)\r\n    keys = [\"card\",\"level\",\"element\",\"term\",\"desc\",\"path_url\",\"Attribute\",\"Cardinality\",\"Element\",\"Namespace\",\"Selector\",\"Section\",\"Business 
Term\",\"id\",\"businessterm\"]\r\n    with open(csv_file, 'w', newline='') as output_file:\r\n        dict_writer = csv.DictWriter(output_file, keys)\r\n        dict_writer.writeheader()\r\n        for row in results:\r\n            if 'term' in row and '\\n' in row['term']:\r\n                term = row['term'].replace('\\n','\\\\n')\r\n                row['term'] = term\r\n            if 'desc' in row and '\\n' in row['desc']:\r\n                desc = row['desc'].replace('\\n','\\\\n')\r\n                row['desc'] = desc\r\n            if 'Business Term' in row and '\\n' in row['Business Term']:\r\n                BusinessTerm = row['Business Term'].replace('\\n','\\\\n')\r\n                row['Business Term'] = BusinessTerm\r\n            if 'businessterm' in row and '\\n' in row['businessterm']:\r\n                businessterm = row['businessterm'].replace('\\n','\\\\n')\r\n                row['businessterm'] = businessterm\r\n            dict_writer.writerow(row)\r\n    print(f'write {ITEM} {csv_file}')\r\n\r\ndef parse_rule(base_url, ITEM):\r\n    pass\r\n\r\ndef main():\r\n    dir = os.path.dirname(__file__)\r\n    ITEM = 'Semantic'\r\n    PINT = 'PINT'\r\n    out_file = os.path.join(dir, f'{PINT}_{ITEM}.json')\r\n    csv_file = os.path.join(dir, f'{PINT}_{ITEM}.csv')\r\n    parse_semantic(PINT,pint_semantic_url,out_file,ITEM)    \r\n    semantic2csv(out_file,csv_file,ITEM)\r\n    PINT = 'JP_PINT'\r\n    out_file = os.path.join(dir, f'{PINT}_{ITEM}.json')\r\n    csv_file = os.path.join(dir, f'{PINT}_{ITEM}.csv')\r\n    parse_semantic(PINT,jp_semantic_url,out_file,ITEM)    \r\n    semantic2csv(out_file,csv_file,ITEM)\r\n    ITEM = 'Syntax'\r\n    PINT = 'PINT'\r\n    out_file = os.path.join(dir, f'{PINT}_{ITEM}.json')\r\n    csv_file = os.path.join(dir, f'{PINT}_{ITEM}.csv')\r\n    parse_syntax(PINT,pint_syntax_url,out_file,ITEM)\r\n    syntax2csv(out_file,csv_file,ITEM)\r\n    PINT = 'JP_PINT'\r\n    out_file = os.path.join(dir, 
f'{PINT}_{ITEM}.json')\r\n    csv_file = os.path.join(dir, f'{PINT}_{ITEM}.csv')\r\n    parse_syntax(PINT,jp_syntax_url,out_file,ITEM)\r\n    syntax2csv(out_file,csv_file,ITEM)\r\n\r\nif __name__ == '__main__':\r\n    main()\r\n<\/pre>\n<h3>CSV<\/h3>\n<h4>JP PINT 0.9.1<\/h4>\n<p><a href=\"https:\/\/www.sambuichi.jp\/wp-content\/uploads\/2022\/05\/JP_PINT_Semantic-1.csv\">JP_PINT_Semantic<\/a><br \/>\n<a href=\"https:\/\/www.sambuichi.jp\/wp-content\/uploads\/2022\/05\/JP_PINT_Syntax-1.csv\">JP_PINT_Syntax<\/a><\/p>\n<h4>PINT 0.1x<\/h4>\n<p><a href=\"https:\/\/www.sambuichi.jp\/wp-content\/uploads\/2022\/05\/PINT_Semantic-1.csv\">PINT_Semantic<\/a><br \/>\n<a href=\"https:\/\/www.sambuichi.jp\/wp-content\/uploads\/2022\/05\/PINT_Syntax-1.csv\">PINT_Syntax<\/a><\/p>\n","protected":false},"excerpt":{"rendered":"<p>Views: 49\u4eca\u9031\uff15\u6708\uff18\u65e5\u306b\u30da\u30dd\u30eb\u30aa\u30fc\u30bd\u30ea\u30c6\u30a3\uff08\u30c7\u30b8\u5e81\uff09\u306e\u30da\u30fc\u30b8 https:\/\/www.digital.go.jp\/policies\/electronic_invoice\/ \u306bJP PINT 0.9.1\u306e\u66f4\u65b0\u304c\u53cd\u6620 
[&hellip;]<\/p>\n","protected":false},"author":2,"featured_media":6256,"comment_status":"open","ping_status":"closed","sticky":false,"template":"","format":"standard","meta":[],"categories":[45,49,50,11],"tags":[],"_links":{"self":[{"href":"https:\/\/www.sambuichi.jp\/index.php?rest_route=\/wp\/v2\/posts\/6233"}],"collection":[{"href":"https:\/\/www.sambuichi.jp\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.sambuichi.jp\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.sambuichi.jp\/index.php?rest_route=\/wp\/v2\/users\/2"}],"replies":[{"embeddable":true,"href":"https:\/\/www.sambuichi.jp\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=6233"}],"version-history":[{"count":8,"href":"https:\/\/www.sambuichi.jp\/index.php?rest_route=\/wp\/v2\/posts\/6233\/revisions"}],"predecessor-version":[{"id":6872,"href":"https:\/\/www.sambuichi.jp\/index.php?rest_route=\/wp\/v2\/posts\/6233\/revisions\/6872"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/www.sambuichi.jp\/index.php?rest_route=\/wp\/v2\/media\/6256"}],"wp:attachment":[{"href":"https:\/\/www.sambuichi.jp\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=6233"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.sambuichi.jp\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=6233"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.sambuichi.jp\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=6233"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}