feat: Ajout des scripts d'import

This commit is contained in:
2023-02-15 15:38:11 +01:00
parent 471b194408
commit a6a7e50ab9
12 changed files with 378 additions and 6 deletions

162
scripts/finess-clean.py Normal file
View File

@ -0,0 +1,162 @@
# -*- coding: utf-8 -*-
# ---
# jupyter:
# jupytext:
# formats: ipynb,py:hydrogen
# text_representation:
# extension: .py
# format_name: hydrogen
# format_version: '1.3'
# jupytext_version: 1.14.1
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
# %% [markdown]
# # Production d'un csv utilisable de la base FINESS
#
# En l'état, l'export CSV de la [base FINESS][finess] n'est pas vraiment satisfaisant et utilisable.
#
# - Le fichier n'est pas réellement un CSV.
# - Il est bizarrement découpé en deux sections qui correspondent au XML.
# - Les colonnes n'ont pas de nom.
# - Le fichier est encodé au format windows.
#
# [finess]: https://www.data.gouv.fr/en/datasets/finess-extraction-du-fichier-des-etablissements/
# %% gradient={"editing": false, "id": "4facc182", "kernelId": ""}
import pandas as pd
import numpy as np
import requests
# %% gradient={"editing": false, "id": "3f7b5d32", "kernelId": ""}
dataset_api = "https://www.data.gouv.fr/api/1/datasets/finess-extraction-du-fichier-des-etablissements/"
# %% gradient={"editing": false, "id": "58d641d4", "kernelId": ""}
resources = (requests
.get(dataset_api)
.json()
['resources']
)
resource_geoloc = [ r for r in resources if r['type'] == 'main' and 'géolocalisés' in r['title']][0]
# %% gradient={"editing": false, "id": "13dd939b", "kernelId": ""}
headers = [
'section',
'nofinesset',
'nofinessej',
'rs',
'rslongue',
'complrs',
'compldistrib',
'numvoie',
'typvoie',
'voie',
'compvoie',
'lieuditbp',
'commune',
'departement',
'libdepartement',
'ligneacheminement',
'telephone',
'telecopie',
'categetab',
'libcategetab',
'categagretab',
'libcategagretab',
'siret',
'codeape',
'codemft',
'libmft',
'codesph',
'libsph',
'dateouv',
'dateautor',
'maj',
'numuai'
]
# %% gradient={"editing": false, "id": "b68dac89", "kernelId": ""}
geoloc_names = [
'nofinesset',
'coordxet',
'coordyet',
'sourcecoordet',
'datemaj'
]
# %% gradient={"editing": false, "id": "4492d3dd", "kernelId": ""}
raw_df = (pd
.read_csv(resource_geoloc['url'],
sep=";", encoding="utf-8", header=None, skiprows=1,
dtype='str',
names=headers)
.drop(columns=['section'])
)
raw_df
# %% gradient={"editing": false, "id": "2efc14bc", "kernelId": ""}
structures = (raw_df
.iloc[:int(raw_df.index.size/2)]
)
structures
# %% gradient={"editing": false, "id": "283be3bb", "kernelId": ""}
geolocalisations = (raw_df
.iloc[int(raw_df.index.size/2):]
.drop(columns=raw_df.columns[5:])
.rename(columns=lambda x: geoloc_names[list(raw_df.columns).index(x)])
)
geolocalisations
# %% gradient={"editing": false, "id": "b54e527e", "kernelId": ""}
clean_df = (structures
.merge(geolocalisations, on="nofinesset", how="left")
)
clean_df
# %%
clean_df.sample().T
# %%
clean_df["siret"]
# %% [markdown] gradient={"editing": false, "id": "82306369-229c-418f-9138-d753e1b71ce4", "kernelId": ""}
# ## Vérification de la qualité des données
# %% gradient={"editing": false, "id": "64975e82-5f97-4bb4-b1d3-8aed85fa37cd", "kernelId": "", "source_hidden": false} jupyter={"outputs_hidden": false}
intersection = pd.Series(np.intersect1d(structures.nofinesset.values, geolocalisations.nofinesset.values))
intersection.shape
# %% gradient={"editing": false, "id": "07e3c1cb-7032-4d83-833c-0979d2592f3c", "kernelId": "", "source_hidden": false} jupyter={"outputs_hidden": false}
only_structures = (structures
[ ~structures.nofinesset.isin(intersection) ]
)
only_structures
# %% gradient={"editing": false, "id": "cfb13e95-b622-4d89-be56-61397dc4370e", "kernelId": "", "source_hidden": false} jupyter={"outputs_hidden": false}
only_geolocalisations = (geolocalisations
[ ~geolocalisations.nofinesset.isin(intersection) ]
)
only_geolocalisations
# %% gradient={"editing": false, "id": "92cd9e34-74c8-454c-96d8-3c628e7b94bd", "kernelId": "", "source_hidden": false} jupyter={"outputs_hidden": false}
geolocalisations_missing = []
# %% [markdown] gradient={"editing": false, "id": "ff24d2da-6b7e-49ca-8ac9-cc1e90d32235", "kernelId": ""}
# ## Export final
# %% gradient={"editing": false, "id": "8f6f3c73-4c14-4e82-ac63-cdf9ab8e4b21", "kernelId": "", "source_hidden": false} jupyter={"outputs_hidden": false}
clean_df.to_csv('finess-clean.csv', encoding='utf-8')
# %%

104
scripts/finess-sisa.py Normal file
View File

@ -0,0 +1,104 @@
# import pandas with shortcut 'pd'
import pandas as pd
import os
from pyproj import Transformer, transform
transformer = Transformer.from_crs(2154, 4326)
headers = [
'section',
'nofinesset',
'nofinessej',
'rs',
'rslongue',
'complrs',
'compldistrib',
'numvoie',
'typvoie',
'voie',
'compvoie',
'lieuditbp',
'commune',
'departement',
'libdepartement',
'ligneacheminement',
'telephone',
'telecopie',
'categetab',
'libcategetab',
'categagretab',
'libcategagretab',
'siret',
'codeape',
'codemft',
'libmft',
'codesph',
'libsph',
'dateouv',
'dateautor',
'maj',
'numuai',
'coordxet',
'coordyet',
'sourcecoordet',
'datemaj'
]
# read_csv function which is used to read the required CSV file
data = pd.read_csv('./finess-clean.csv', sep=",", dtype='str', names=headers)
# display
#print("Original 'input.csv' CSV Data: \n")
#print(data)
header_drop = [
'section',
# 'nofinesset',
'nofinessej',
'rs',
#'rslongue',
'complrs',
'compldistrib',
'numvoie',
'typvoie',
'voie',
'compvoie',
'lieuditbp',
'commune',
#'departement',
'libdepartement',
'ligneacheminement',
#'telephone',
'telecopie',
'categetab',
'libcategetab',
'categagretab',
'libcategagretab',
#'siret',
'codeape',
'codemft',
'libmft',
'codesph',
'libsph',
'dateouv',
'dateautor',
'maj',
'numuai',
#'coordxet',
#'coordyet',
'sourcecoordet',
'datemaj'
]
data = data.query('categetab == "603"')
# drop function which is used in removing or deleting rows or columns from the CSV files
data.drop(header_drop, inplace=True, axis=1)
def convertCoord (row):
row.coordxet, row.coordyet = transformer.transform(row.coordxet, row.coordyet)
return row
data.transform(convertCoord, axis=1)
data.to_json('../static/data.json', orient='values') #https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_json.html?highlight=to_json#pandas.DataFrame.to_json

3
scripts/requirements.txt Normal file
View File

@ -0,0 +1,3 @@
pandas==1.5.0
requests==2.28.1
pyproj==3.4.0