feat: Ajout des scripts d'import

2023-02-15 15:38:11 +01:00
parent 471b194408
commit a6a7e50ab9
12 changed files with 378 additions and 6 deletions
--- a/.drone.yml
+++ b/.drone.yml
@ -0,0 +1,51 @@
 ---
 # Encrypted access key ID used by the production deploy step.
 # Generated with: drone encrypt P4Pillon/annuaire $AWS_ACCESS_KEY_ID
 kind: secret
 name: PRODUCTION_AWS_ACCESS_KEY_ID
 data: msNI263HuJxTaNJ1ljO7SAH4v8RFFF/RlzwXCVnGtmrjLMF02ab1TYgOJq8WSUuYVSjQnVwi
 ---
 # Encrypted secret access key used by the production deploy step.
 # Generated with: drone encrypt P4Pillon/annuaire $AWS_SECRET_ACCESS_KEY
 kind: secret
 name: PRODUCTION_AWS_SECRET_ACCESS_KEY
 data: LgbdoMtBw9NOcvrpCmzhmZMEneFNzFXjODTJ6relyZkAHeYX8JtXSwSbss2d824wc/ANJZ9Pox10FhL99A33c6IhT9+QVXKme0S/ZuD6CMcWMx6fRHvlL2li2IQ=
 ---
 # Production pipeline: import the data, build the site, deploy it, and send a
 # Matrix notification when something failed.
 kind: pipeline
 type: docker
 name: prod
 steps:
   # Runs the import scripts; finess-sisa.py writes static/data.json, which the
   # site loads at runtime.
   - name: Import
     # Minor version pinned on purpose: scripts/requirements.txt pins exact
     # package versions, and the floating `python:3` tag keeps moving to newer
     # interpreters those pinned versions may not install on.
     image: python:3.10
     commands:
       - (cd scripts && pip install -r requirements.txt)
       - (cd scripts && python3 finess-clean.py)
       - (cd scripts && python3 finess-sisa.py)
   - name: build website
     image: klakegg/hugo:0.101.0-ext-debian-ci
     commands:
       - hugo --minify --environment production
   - name: deploy
     image: klakegg/hugo:0.101.0-ext-debian-ci
     environment:
       AWS_ACCESS_KEY_ID:
         from_secret: PRODUCTION_AWS_ACCESS_KEY_ID
       AWS_SECRET_ACCESS_KEY:
         from_secret: PRODUCTION_AWS_SECRET_ACCESS_KEY
     commands:
       - hugo deploy --environment production
   # Only runs when an earlier step failed.
   - name: notify
     image: plugins/matrix@sha256:f1affb31b0c86963c97c6f976fa0dcb3cc84272057fd8558d609d28b3064bd7f
     settings:
       homeserver: https://converser.eu
       roomid: "QwOITmkKxRJJyCSDOZ:converser.eu"
       userid: "resilien:converser.eu"
       accesstoken:
         from_secret: MATRIX_ACCESSTOKEN
     when:
       status:
         - failure
--- a/.gitignore
+++ b/.gitignore
@ -1,2 +1,5 @@
 public
 resources
 .hugo_build.lock
 scripts/*.csv
 static/data.json
--- a/README.md
+++ b/README.md
@ -0,0 +1,3 @@
 # P4Pillon annuaire
 Mise en place d'une cartographie des SISA en France.
--- a/config.toml
+++ b/config.toml
@ -1,3 +0,0 @@
 baseURL = 'http://example.org/'
 languageCode = 'en-us'
 title = 'My New Hugo Site'
--- a/config/_default/config.yml
+++ b/config/_default/config.yml
@ -0,0 +1,26 @@
 # Hugo settings shared by every environment.
 title: Annuaire
 languageCode: fr-fr
 defaultContentLanguage: fr
 timeout: 200s
 enableRobotsTXT: true
 disableHugoGeneratorInject: true

 # Page kinds that are switched off for this site.
 disableKinds: [taxonomy, term]

 permalinks:
   actualites: /actualites/:year/:month/:title/

 markup:
   goldmark:
     renderer:
       unsafe: true

 # Custom site parameters.
 params:
   debug: false
   description: "Site d'annuaire de maison de santé en France"
   Keywords: Carte Sisa Maison santé
--- a/config/development/config.yml
+++ b/config/development/config.yml
@ -0,0 +1 @@
 # Development environment: the site is addressed on the local machine, port 1313.
 baseURL: http://localhost:1313/
--- a/config/production/config.yml
+++ b/config/production/config.yml
@ -0,0 +1,7 @@
 # Production environment: public address of the site.
 baseURL: https://annuaire.p4pillon.org/
 # Deployment target named "production". The URL addresses the bucket
 # annuaire.p4pillon.org through a custom endpoint, with path-style requests
 # and the region set to "garage".
 # NOTE(review): disableSSL=true is set next to an https:// endpoint — confirm
 # which of the two decides the transport actually used.
 deployment:
  targets:
    - name: production
      URL: >-
        s3://annuaire.p4pillon.org?endpoint=https://s3.garage.resilien.cloud&disableSSL=true&s3ForcePathStyle=true&region=garage
--- a/layouts/index.html
+++ b/layouts/index.html
@ -38,13 +38,29 @@
 		attribution: '&copy; <a href="http://www.openstreetmap.org/copyright">OpenStreetMap</a>'
 	}).addTo(map);
-	var json = fetch('./finess-small.json').then(response => {
+	const columns = {
 		finessET: 0,
 		name: 1,
 		dep: 2,
 		tel: 3,
 		siret: 4,
 		x: 5,
 		y: 6,
 	}
 	var json = fetch('./data.json').then(response => {
 	return response.json();
 	})
 	.then(jsondata => {
 		var markersCluster = new L.MarkerClusterGroup();
 		for (const msp of jsondata) {
-			const marker = L.marker([msp[5], msp[6]]).bindPopup(msp[0] + "(" + msp[2] + ")<br><a href='tel:" + msp[3] + "'>" + msp[3] + "</a>");
+			const marker = L
 				.marker([msp[columns.x], msp[columns.y]])
 				.bindPopup(
 					msp[columns.name] + " (" + msp[columns.dep] + ")<br>" +
 					"Établissement FINESS N°" + msp[columns.finessET] + "<br>" +
 					(msp[columns.siret] != null ? "SIREN : <a rel='noreferrer' target='_blank' href='https://data.inpi.fr/entreprises/" + msp[columns.siret].substring(0, 9) + "'>" + msp[columns.siret].substring(0, 9) + "</a><br>" : "") +
 					(msp[columns.tel] != null ? "<a href='tel:" + msp[columns.tel] + "'>" + msp[columns.tel] + "</a>" : "")
 				);
 			markersCluster.addLayer(marker);
 		}
 		map.addLayer(markersCluster);
--- a/scripts/finess-clean.py
+++ b/scripts/finess-clean.py
@ -0,0 +1,162 @@
 # -*- coding: utf-8 -*-
 # ---
 # jupyter:
 #   jupytext:
 #     formats: ipynb,py:hydrogen
 #     text_representation:
 #       extension: .py
 #       format_name: hydrogen
 #       format_version: '1.3'
 #       jupytext_version: 1.14.1
 #   kernelspec:
 #     display_name: Python 3 (ipykernel)
 #     language: python
 #     name: python3
 # ---
 # %% [markdown]
 # # Production d'un csv utilisable de la base FINESS
 #
 # En l'état, l'export CSV de la [base FINESS][finess] n'est pas vraiment satisfaisant et utilisable.
 #
 # - Le fichier n'est pas réellement un CSV.
 #     - Il est bizarrement découpé en deux sections qui correspondent au XML.
 #     - Les colonnes n'ont pas de nom.
 # - Le fichier est encodé au format windows.
 #
 # [finess]: https://www.data.gouv.fr/en/datasets/finess-extraction-du-fichier-des-etablissements/
 # %% gradient={"editing": false, "id": "4facc182", "kernelId": ""}
 import pandas as pd
 import numpy as np
 import requests
 # %% gradient={"editing": false, "id": "3f7b5d32", "kernelId": ""}
 # data.gouv.fr API endpoint describing the FINESS dataset and its resources.
 dataset_api = "https://www.data.gouv.fr/api/1/datasets/finess-extraction-du-fichier-des-etablissements/"
 # %% gradient={"editing": false, "id": "58d641d4", "kernelId": ""}
 # Fetch the dataset description. The timeout stops an unattended run from
 # hanging on a stalled connection, and raise_for_status() fails right away on
 # an HTTP error instead of failing later on an unexpected JSON payload.
 response = requests.get(dataset_api, timeout=60)
 response.raise_for_status()
 resources = response.json()['resources']
 # Select the first "main" resource whose title contains 'géolocalisés'.
 resource_geoloc = next(
     (r for r in resources if r['type'] == 'main' and 'géolocalisés' in r['title']),
     None,
 )
 if resource_geoloc is None:
     # Same exception type as the former `[...][0]` lookup, with a usable message.
     raise IndexError(
         "no 'main' resource with 'géolocalisés' in its title was found in " + dataset_api
     )
 # %% gradient={"editing": false, "id": "13dd939b", "kernelId": ""}
 # Names given, in this order, to the columns of the raw export when it is read.
 # The line grouping below is for readability only.
 headers = [
     'section', 'nofinesset', 'nofinessej',
     'rs', 'rslongue', 'complrs', 'compldistrib',
     'numvoie', 'typvoie', 'voie', 'compvoie', 'lieuditbp',
     'commune', 'departement', 'libdepartement', 'ligneacheminement',
     'telephone', 'telecopie',
     'categetab', 'libcategetab', 'categagretab', 'libcategagretab',
     'siret', 'codeape',
     'codemft', 'libmft', 'codesph', 'libsph',
     'dateouv', 'dateautor', 'maj', 'numuai',
 ]
 # %% gradient={"editing": false, "id": "b68dac89", "kernelId": ""}
 # Names of the columns kept for the geolocation rows, in positional order.
 geoloc_names = ['nofinesset', 'coordxet', 'coordyet', 'sourcecoordet', 'datemaj']
 # %% gradient={"editing": false, "id": "4492d3dd", "kernelId": ""}
 # Read the raw export: ';'-separated, first line skipped, every column kept as
 # text and named from `headers`.
 raw_with_section = pd.read_csv(
     resource_geoloc['url'],
     sep=";", encoding="utf-8", header=None, skiprows=1,
     dtype='str',
     names=headers,
 )
 # Keep the section marker of each row before the column is dropped: it is what
 # tells the two kinds of rows apart.
 section = raw_with_section['section']
 raw_df = raw_with_section.drop(columns=['section'])
 raw_df
 # %% gradient={"editing": false, "id": "2efc14bc", "kernelId": ""}
 # Structure rows are the ones carrying the same section marker as the first row.
 # Splitting on the marker (rather than cutting the table in the middle) gives
 # the same result when both sections have the same size, and stays correct
 # when they do not.
 first_section = section.iloc[0] if len(section) else None
 is_structure = section == first_section
 structures = raw_df[is_structure]
 structures
 # %% gradient={"editing": false, "id": "283be3bb", "kernelId": ""}
 # Geolocation rows only use the leading columns; keep as many as there are
 # names in `geoloc_names` and rename them by position.
 geolocalisations = (raw_df
     [~is_structure]
     .iloc[:, :len(geoloc_names)]
     .rename(columns=dict(zip(raw_df.columns, geoloc_names)))
 )
 geolocalisations
 # %% gradient={"editing": false, "id": "b54e527e", "kernelId": ""}
 # Left join on nofinesset: every structure row is kept, and the geolocation
 # columns are filled in where a geolocation row has the same nofinesset.
 clean_df = pd.merge(structures, geolocalisations, how="left", on="nofinesset")
 clean_df
 # %%
 # One random row, transposed so that each column shows on its own line.
 clean_df.sample().T
 # %%
 clean_df["siret"]
 # %% [markdown] gradient={"editing": false, "id": "82306369-229c-418f-9138-d753e1b71ce4", "kernelId": ""}
 # ## Data quality checks
 # %% gradient={"editing": false, "id": "64975e82-5f97-4bb4-b1d3-8aed85fa37cd", "kernelId": "", "source_hidden": false} jupyter={"outputs_hidden": false}
 # nofinesset values present in both tables.
 intersection = pd.Series(
     np.intersect1d(
         structures['nofinesset'].values,
         geolocalisations['nofinesset'].values,
     )
 )
 intersection.shape
 # %% gradient={"editing": false, "id": "07e3c1cb-7032-4d83-833c-0979d2592f3c", "kernelId": "", "source_hidden": false} jupyter={"outputs_hidden": false}
 # Structures with no geolocation row.
 only_structures = structures[~structures['nofinesset'].isin(intersection)]
 only_structures
 # %% gradient={"editing": false, "id": "cfb13e95-b622-4d89-be56-61397dc4370e", "kernelId": "", "source_hidden": false} jupyter={"outputs_hidden": false}
 # Geolocation rows with no structure.
 only_geolocalisations = geolocalisations[~geolocalisations['nofinesset'].isin(intersection)]
 only_geolocalisations
 # %% gradient={"editing": false, "id": "92cd9e34-74c8-454c-96d8-3c628e7b94bd", "kernelId": "", "source_hidden": false} jupyter={"outputs_hidden": false}
 geolocalisations_missing = []
 # %% [markdown] gradient={"editing": false, "id": "ff24d2da-6b7e-49ca-8ac9-cc1e90d32235", "kernelId": ""}
 # ## Final export
 # %% gradient={"editing": false, "id": "8f6f3c73-4c14-4e82-ac63-cdf9ab8e4b21", "kernelId": "", "source_hidden": false} jupyter={"outputs_hidden": false}
 # Write the merged table to finess-clean.csv in the current directory, UTF-8
 # encoded. `index` is left at its default, so the row index is written too.
 clean_df.to_csv('finess-clean.csv', encoding='utf-8')
 # %%
--- a/scripts/finess-sisa.py
+++ b/scripts/finess-sisa.py
@ -0,0 +1,104 @@
 # import pandas with shortcut 'pd'
 import pandas as pd  
 import os
 from pyproj import Transformer, transform
 # Coordinate transformer from EPSG:2154 to EPSG:4326.
 # NOTE(review): always_xy is not passed, so the output presumably follows the
 # target CRS axis order (latitude first for EPSG:4326) — confirm against pyproj.
 transformer = Transformer.from_crs(2154, 4326)
 # Names given, in this order, to the columns of finess-clean.csv when it is
 # read. The line grouping below is for readability only.
 headers = [
     'section', 'nofinesset', 'nofinessej',
     'rs', 'rslongue', 'complrs', 'compldistrib',
     'numvoie', 'typvoie', 'voie', 'compvoie', 'lieuditbp',
     'commune', 'departement', 'libdepartement', 'ligneacheminement',
     'telephone', 'telecopie',
     'categetab', 'libcategetab', 'categagretab', 'libcategagretab',
     'siret', 'codeape',
     'codemft', 'libmft', 'codesph', 'libsph',
     'dateouv', 'dateautor', 'maj', 'numuai',
     'coordxet', 'coordyet', 'sourcecoordet', 'datemaj',
 ]
 # Load finess-clean.csv with every column kept as text. The file starts with a
 # header line; header=0 makes pandas consume that line and replace it with
 # `headers` instead of loading it as if it were a data row.
 data = pd.read_csv('./finess-clean.csv', sep=",", dtype='str', header=0, names=headers)
 # display
 #print("Original 'input.csv' CSV Data: \n")
 #print(data)
 # Columns removed before the export. The columns that are NOT listed, and so
 # survive, are: nofinesset, rslongue, departement, telephone, siret, coordxet
 # and coordyet.
 header_drop = [
     'section', 'nofinessej',
     'rs', 'complrs', 'compldistrib',
     'numvoie', 'typvoie', 'voie', 'compvoie', 'lieuditbp',
     'commune', 'libdepartement', 'ligneacheminement',
     'telecopie',
     'categetab', 'libcategetab', 'categagretab', 'libcategagretab',
     'codeape',
     'codemft', 'libmft', 'codesph', 'libsph',
     'dateouv', 'dateautor', 'maj', 'numuai',
     'sourcecoordet', 'datemaj',
 ]
 # Keep only the rows whose category code (categetab) is "603", then remove the
 # columns listed in header_drop. drop() returns a new frame here, which avoids
 # modifying the result of query() in place.
 data = data.query('categetab == "603"').drop(columns=header_drop)


 def convertCoord(row):
     """Return a copy of ``row`` with its coordinates passed through ``transformer``.

     Parameters:
         row: one row of ``data``; its ``coordxet`` and ``coordyet`` entries are
             read as numbers.

     Returns:
         A new row in which ``coordxet`` and ``coordyet`` hold the first and
         second value returned by ``transformer.transform``. The input row is
         left untouched.
     """
     converted = row.copy()
     converted['coordxet'], converted['coordyet'] = transformer.transform(
         float(row['coordxet']), float(row['coordyet'])
     )
     return converted


 # apply() returns a NEW DataFrame built from the rows handed back by
 # convertCoord, so the result must be assigned. Calling it without keeping the
 # result only works if pandas happens to hand out views on the original data.
 data = data.apply(convertCoord, axis=1)
 # orient='values' writes a bare array of rows without column names; the page
 # in layouts/index.html reads the fields by position.
 data.to_json('../static/data.json', orient='values') #https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_json.html?highlight=to_json#pandas.DataFrame.to_json
--- a/scripts/requirements.txt
+++ b/scripts/requirements.txt
@ -0,0 +1,3 @@
 pandas==1.5.0
 requests==2.28.1
 pyproj==3.4.0
--- a/static/finess-small.json
+++ b/static/finess-small.json
		`@ -0,0 +1,3 @@`
							`# P4Pillon annuaire`

							`Mise en place d'une cartographie des SISA en France.`