163 lines
4.4 KiB
Python
163 lines
4.4 KiB
Python
# -*- coding: utf-8 -*-
|
|
# ---
|
|
# jupyter:
|
|
# jupytext:
|
|
# formats: ipynb,py:hydrogen
|
|
# text_representation:
|
|
# extension: .py
|
|
# format_name: hydrogen
|
|
# format_version: '1.3'
|
|
# jupytext_version: 1.14.1
|
|
# kernelspec:
|
|
# display_name: Python 3 (ipykernel)
|
|
# language: python
|
|
# name: python3
|
|
# ---
|
|
|
|
# %% [markdown]
|
|
# # Production d'un csv utilisable de la base FINESS
|
|
#
|
|
# En l'état, l'export CSV de la [base FINESS][finess] n'est pas vraiment satisfaisant et utilisable.
|
|
#
|
|
# - Le fichier n'est pas réellement un CSV.
|
|
# - Il est bizarrement découpé en deux sections qui correspondent au XML.
|
|
# - Les colonnes n'ont pas de nom.
|
|
# - Le fichier est encodé au format windows.
|
|
#
|
|
# [finess]: https://www.data.gouv.fr/en/datasets/finess-extraction-du-fichier-des-etablissements/
|
|
|
|
# %% gradient={"editing": false, "id": "4facc182", "kernelId": ""}
|
|
import pandas as pd
|
|
import numpy as np
|
|
import requests
|
|
|
|
# %% gradient={"editing": false, "id": "3f7b5d32", "kernelId": ""}
|
|
# data.gouv.fr API endpoint describing the FINESS dataset and its resources.
dataset_api = "https://www.data.gouv.fr/api/1/datasets/finess-extraction-du-fichier-des-etablissements/"

# %% gradient={"editing": false, "id": "58d641d4", "kernelId": ""}
# Fetch the dataset description. The timeout keeps the cell from hanging
# forever on a stalled connection, and raise_for_status() reports HTTP errors
# here rather than as an obscure JSON/KeyError on the next line.
response = requests.get(dataset_api, timeout=30)
response.raise_for_status()
resources = response.json()['resources']

# Pick the first resource of type "main" whose title contains 'géolocalisés'.
# Entries lacking a 'type' or 'title' key are skipped instead of crashing.
resource_geoloc = next(
    (
        r for r in resources
        if r.get('type') == 'main' and 'géolocalisés' in (r.get('title') or '')
    ),
    None,
)
if resource_geoloc is None:
    # Fail with an explicit message instead of a bare IndexError.
    raise LookupError(
        f"No 'main' resource with 'géolocalisés' in its title found at {dataset_api}"
    )
|
|
|
|
# %% gradient={"editing": false, "id": "13dd939b", "kernelId": ""}
|
|
# Column names applied, in this exact order, to the raw export when it is
# read with pandas (they are passed as `names=` to `read_csv`).
headers = [
    'section', 'nofinesset', 'nofinessej',
    'rs', 'rslongue', 'complrs', 'compldistrib',
    'numvoie', 'typvoie', 'voie', 'compvoie', 'lieuditbp',
    'commune', 'departement', 'libdepartement', 'ligneacheminement',
    'telephone', 'telecopie',
    'categetab', 'libcategetab', 'categagretab', 'libcategagretab',
    'siret', 'codeape',
    'codemft', 'libmft', 'codesph', 'libsph',
    'dateouv', 'dateautor', 'maj', 'numuai',
]
|
|
|
|
# %% gradient={"editing": false, "id": "b68dac89", "kernelId": ""}
|
|
# Names given, by position, to the columns kept for the geolocation rows.
geoloc_names = ['nofinesset', 'coordxet', 'coordyet', 'sourcecoordet', 'datemaj']
|
|
|
|
# %% gradient={"editing": false, "id": "4492d3dd", "kernelId": ""}
|
|
# Load the export with every field read as text. The first physical line is
# skipped and the column names come from `headers` (header=None tells pandas
# the file has no header row of its own).
full_df = pd.read_csv(
    resource_geoloc['url'],
    sep=";",
    encoding="utf-8",
    header=None,
    skiprows=1,
    dtype='str',
    names=headers,
)

# Keep the per-row section label aside before dropping the column: it is what
# tells the two kinds of rows apart in the cells below.
sections = full_df['section']
raw_df = full_df.drop(columns=['section'])

raw_df

# %% gradient={"editing": false, "id": "2efc14bc", "kernelId": ""}
# Rows carrying the same section label as the first row are the structures.
# Splitting on the label, rather than cutting the frame at its midpoint, stays
# correct when the two sections do not hold the same number of rows.
# NOTE(review): assumes the structures section comes first in the file, as the
# previous positional split also did — confirm against the published export.
first_section = sections.iloc[0] if not sections.empty else None
is_structure = sections == first_section

structures = raw_df[is_structure]

structures

# %% gradient={"editing": false, "id": "283be3bb", "kernelId": ""}
# For the remaining rows, keep only the first len(geoloc_names) columns and
# rename them positionally to the geolocation column names.
geoloc_columns = raw_df.columns[:len(geoloc_names)]
geolocalisations = (raw_df
    .loc[~is_structure, geoloc_columns]
    .rename(columns=dict(zip(geoloc_columns, geoloc_names)))
)

geolocalisations
|
|
|
|
# %% gradient={"editing": false, "id": "b54e527e", "kernelId": ""}
|
|
# Attach the geolocation columns to each structure. The left join keeps every
# row of `structures`, whether or not a matching `nofinesset` exists in
# `geolocalisations`.
clean_df = pd.merge(
    structures,
    geolocalisations,
    how="left",
    on="nofinesset",
)

clean_df

# %%
# One randomly drawn row, transposed so each column appears on its own line.
clean_df.sample(n=1).transpose()

# %%
clean_df.loc[:, "siret"]
|
|
|
|
# %% [markdown] gradient={"editing": false, "id": "82306369-229c-418f-9138-d753e1b71ce4", "kernelId": ""}
|
|
# ## Vérification de la qualité des données
|
|
|
|
# %% gradient={"editing": false, "id": "64975e82-5f97-4bb4-b1d3-8aed85fa37cd", "kernelId": "", "source_hidden": false} jupyter={"outputs_hidden": false}
|
|
# `nofinesset` values present in both the structures and the geolocation rows.
shared_ids = np.intersect1d(
    structures["nofinesset"].values,
    geolocalisations["nofinesset"].values,
)
intersection = pd.Series(shared_ids)

intersection.shape

# %% gradient={"editing": false, "id": "07e3c1cb-7032-4d83-833c-0979d2592f3c", "kernelId": "", "source_hidden": false} jupyter={"outputs_hidden": false}
# Structures whose `nofinesset` has no counterpart among the geolocation rows.
has_geoloc = structures["nofinesset"].isin(intersection)
only_structures = structures.loc[~has_geoloc]

only_structures

# %% gradient={"editing": false, "id": "cfb13e95-b622-4d89-be56-61397dc4370e", "kernelId": "", "source_hidden": false} jupyter={"outputs_hidden": false}
# Geolocation rows whose `nofinesset` matches no structure.
has_structure = geolocalisations["nofinesset"].isin(intersection)
only_geolocalisations = geolocalisations.loc[~has_structure]

only_geolocalisations

# %% gradient={"editing": false, "id": "92cd9e34-74c8-454c-96d8-3c628e7b94bd", "kernelId": "", "source_hidden": false} jupyter={"outputs_hidden": false}
# Empty placeholder; nothing is collected into it in this file.
geolocalisations_missing = []
|
|
|
|
# %% [markdown] gradient={"editing": false, "id": "ff24d2da-6b7e-49ca-8ac9-cc1e90d32235", "kernelId": ""}
|
|
# ## Export final
|
|
|
|
# %% gradient={"editing": false, "id": "8f6f3c73-4c14-4e82-ac63-cdf9ab8e4b21", "kernelId": "", "source_hidden": false} jupyter={"outputs_hidden": false}
|
|
# Write the merged table to disk as UTF-8.
# NOTE(review): index=False is not passed, so the DataFrame index is written
# as an unnamed first column — confirm this is intended.
output_file = 'finess-clean.csv'
clean_df.to_csv(output_file, encoding='utf-8')
|
|
|
|
# %%
|