times_series

Transcription

times_series
times_series
September 27, 2016
1
Time Series
In [ ]: %pylab inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
Populating the interactive namespace from numpy and matplotlib
In [ ]: from jyquickhelper import add_notebook_menu
add_notebook_menu()
Out[ ]: <IPython.core.display.HTML object>
1.1
Connection to the cluster
In [ ]: import os
hackathon = {}
if "CRCREDENTIALS" in os.environ:
hackathon["blob_storage"], hackathon["password"] = os.environ["CRCREDEN
r = type(hackathon)
else:
from pyquickhelper.ipythonhelper import open_html_form
params={"blob_storage":"", "password":""}
r = open_html_form(params=params,title="server + credentials", key_save
r
Out[ ]: dict
In [ ]: blobstorage = hackathon["blob_storage"]
blobpassword = hackathon["password"]
In [ ]: cl, bs = %blob_open
cl, bs
Out[ ]: (<pyensae.remote.azure_connection.AzureClient at 0xa145d30>,
<azure.storage.blob.blobservice.BlobService at 0xa13a6a0>)
1
1.2
Download data
In [ ]: %blob_ls croix-rouge
Out[ ]:
    name                            content_type              content_length  last_modified                  blob_type
0   build/SINVOICE_M.csv            application/octet-stream       533771533  Wed, 18 Nov 2015 18:56:27 GMT  BlockBlob
1   build/Test_CRFFOR.GACCTMPD.csv  application/octet-stream       822231942  Sun, 22 Nov 2015 21:53:38 GMT  BlockBlob
2   data/ITMMASTER.schema.txt       application/octet-stream            5658  Mon, 16 Nov 2015 23:00:34 GMT  BlockBlob
3   data/ITMMASTER.txt              application/octet-stream       103096479  Mon, 09 Nov 2015 21:41:00 GMT  BlockBlob
4   data/SINVOICE.schema.txt        application/octet-stream           10252  Mon, 16 Nov 2015 23:00:35 GMT  BlockBlob
5   data/SINVOICE.txt               application/octet-stream      1362433753  Mon, 09 Nov 2015 21:42:32 GMT  BlockBlob
6   data/SINVOICEV.schema.txt       application/octet-stream            7999  Mon, 16 Nov 2015 23:00:35 GMT  BlockBlob
7   data/SINVOICEV.txt              application/octet-stream      1252461865  Mon, 09 Nov 2015 21:44:08 GMT  BlockBlob
8   data/SINVOICEV_.txt             application/octet-stream      1252461865  Tue, 24 Nov 2015 15:35:49 GMT  BlockBlob
9   data/SINVOICE_.txt              application/octet-stream      1362433753  Tue, 24 Nov 2015 15:34:54 GMT  BlockBlob
10  data/enseignes_france.csv       application/octet-stream         6303836  Mon, 09 Nov 2015 21:40:54 GMT  BlockBlob
11  data/stojou.csv                 application/octet-stream      8821375868  Mon, 09 Nov 2015 21:55:23 GMT  BlockBlob
12  readme.txt                      application/octet-stream              45  Sun, 22 Nov 2015 18:00:34 GMT  BlockBlob
In [ ]: %blob_down croix-rouge/data/SINVOICE.schema.txt SINVOICE.schema.txt
Out[ ]: 'SINVOICE.schema.txt'
In [ ]: import pandas
df = pandas.read_csv("SINVOICE.schema.txt", sep="\t", encoding="utf8")
df.to_excel("SINVOICE.schema.xlsx")
df.head(n=1)
Out[ ]:
0
0
Zone
SIVTYP
Typ
TSV
Menu
0
Long
NaN
Act
NaN
Dim Intitulé normal Intitulé abrégé
1
Type facture
Type fac
Intitulé long Options Table liée Expression de lien
Type facture
NaN TABSIVTYP
NaN
2
\
Copie législation
NaN
\
Annulation Vérification Obligatoire
0
Bloquant
Oui
Non
RAZ Mot-clé d'aide
Non
NaN
In [ ]: %blob_down croix-rouge/data/SINVOICE_.txt SINVOICE_.txt
1.3
1.3.1
Operations, conversions, and some other stuff
change encoding
In [ ]: from ensae_projects.data import change_encoding
change_encoding("SINVOICE_.txt", "SINVOICE_.utf8.txt", enc1="latin-1")
Out[ ]: 1446850
In [ ]: import pyensae
%head SINVOICE_.utf8.txt --n=2
Out[ ]: <IPython.core.display.HTML object>
1.3.2
enumerate rows as dictionary
In [ ]: from ensae_projects.data import enumerate_text_lines
def clean_column_name(s):
    """Return the column name *s* with every ``"_0"`` substring removed.

    Passed as the ``clean_column_name`` callback of ``enumerate_text_lines``
    in this notebook, so that header names are normalised before rows are
    turned into dictionaries.

    :param s: raw column name (a string)
    :return: *s* without any ``"_0"`` occurrence
    """
    return s.replace("_0", "")
for i, row in enumerate(enumerate_text_lines("SINVOICE_.utf8.txt",
encoding="utf-8", header=True,
clean_column_name=clean_column_name)):
print(list(sorted(row.keys())))
break
['ACCDAT', 'ACCNUM', 'AMTATI', 'AMTATIL', 'AMTNOT', 'AMTNOTL', 'AMTTAX', 'AMTTAXUSA
1.3.3
select a few columns and look at them
In [ ]: bigfile = enumerate_text_lines("SINVOICE_.utf8.txt", encoding="utf-8", head
clean_column_name=clean_column_name, fLOG=pri
In [ ]: l = map(lambda row: { "BPR":row["BPR"], "BPRDAT":row["BPRDAT"]}, bigfile)
In [ ]: l = list(l)
len(l)
SINVOICE_.utf8.txt - 100000 lines
SINVOICE_.utf8.txt - 200000 lines
SINVOICE_.utf8.txt - 300000 lines
SINVOICE_.utf8.txt - 400000 lines
SINVOICE_.utf8.txt - 500000 lines
3
SINVOICE_.utf8.txt - 600000 lines
SINVOICE_.utf8.txt - 700000 lines
SINVOICE_.utf8.txt - 800000 lines
SINVOICE_.utf8.txt - 900000 lines
SINVOICE_.utf8.txt - 1000000 lines
SINVOICE_.utf8.txt - 1100000 lines
SINVOICE_.utf8.txt - 1200000 lines
SINVOICE_.utf8.txt - 1300000 lines
SINVOICE_.utf8.txt - 1400000 lines
Out[ ]: 1446850
In [ ]: import pandas
df = pandas.DataFrame(l)
In [ ]: df.head()
Out[ ]:
         BPR    BPRDAT
0  100000104  28/10/10
1  100000177  28/10/10
2  100000161  28/10/10
3  100000280  28/10/10
4  100000198  28/10/10
In [ ]: df.groupby("BPRDAT").count().plot()
Out[ ]: <matplotlib.axes._subplots.AxesSubplot at 0xa3d3a20>
4
In [ ]:
5