![](https://crypto4nerd.com/wp-content/uploads/2023/06/1oWtSBtfhn1GnCPu5JYp-OQ-1024x576.png)
In Jupyter : my preferred option, but assumes you already have access to a notebook (or are comfortable sorting it out)
In a Snowflake Worksheet: the route less travelled but arguably easier if you don’t have easy access to a notebook environment
First, Make a JSON With Your Credentials
It should look something like this
{
"account": "your_account_name",
"user": "your_user_name",
"password": "insert_your_pwd_here",
"role": "ACCOUNTADMIN"
Next, Make sure you have the required packages
The base python environment should have most of the packages, but probably won't have the Snowflake specific stuff, so follow these instructions:
First up create a file pysnowpark_ml.yml and copy this into it
name: pysnowpark_ml
channels:
- https://repo.anaconda.com/pkgs/snowflake
- defaults
dependencies:
- aiofiles=22.1.0=py38hecd8cb5_0
- aiosqlite=0.18.0=py38hecd8cb5_0
- argon2-cffi=21.3.0=pyhd3eb1b0_0
- argon2-cffi-bindings=21.2.0=py38hca72f7f_0
- babel=2.11.0=py38hecd8cb5_0
- backcall=0.2.0=pyhd3eb1b0_0
- brotlipy=0.7.0=py38h9ed2024_1003
- ca-certificates=2023.05.30=hecd8cb5_0
- certifi=2023.5.7=py38hecd8cb5_0
- cffi=1.15.1=py38h6c40b1e_3
- comm=0.1.2=py38hecd8cb5_0
- decorator=5.1.1=pyhd3eb1b0_0
- defusedxml=0.7.1=pyhd3eb1b0_0
- entrypoints=0.4=py38hecd8cb5_0
- flit-core=3.8.0=py38hecd8cb5_0
- icu=58.2=h0a44026_3
- idna=3.4=py38hecd8cb5_0
- importlib-metadata=6.0.0=py38hecd8cb5_0
- importlib_metadata=6.0.0=hd3eb1b0_0
- importlib_resources=5.2.0=pyhd3eb1b0_1
- ipython_genutils=0.2.0=pyhd3eb1b0_1
- jinja2=3.1.2=py38hecd8cb5_0
- json5=0.9.6=pyhd3eb1b0_0
- jsonschema=4.17.3=py38hecd8cb5_0
- jupyter_client=8.1.0=py38hecd8cb5_0
- jupyter_core=5.3.0=py38hecd8cb5_0
- jupyter_events=0.6.3=py38hecd8cb5_0
- jupyter_server=1.23.4=py38hecd8cb5_0
- jupyter_server_fileid=0.9.0=py38hecd8cb5_0
- jupyter_server_ydoc=0.8.0=py38hecd8cb5_1
- jupyter_ydoc=0.2.4=py38hecd8cb5_0
- jupyterlab=3.6.3=py38hecd8cb5_0
- jupyterlab_pygments=0.1.2=py_0
- jupyterlab_server=2.22.0=py38hecd8cb5_0
- libcxx=14.0.6=h9765a3e_0
- libffi=3.4.2=hecd8cb5_6
- libiconv=1.16=hca72f7f_2
- libsodium=1.0.18=h1de35cc_0
- libxml2=2.10.3=h930c0e2_0
- libxslt=1.1.37=h6d1eb0e_0
- lxml=4.9.2=py38h6c40b1e_0
- matplotlib-inline=0.1.6=py38hecd8cb5_0
- ncurses=6.4=hcec6c5f_0
- nest-asyncio=1.5.6=py38hecd8cb5_0
- notebook=6.5.3=py38hecd8cb5_0
- notebook-shim=0.2.2=py38hecd8cb5_0
- openssl=1.1.1u=hca72f7f_0
- packaging=23.0=py38hecd8cb5_0
- pandocfilters=1.5.0=pyhd3eb1b0_0
- parso=0.8.3=pyhd3eb1b0_0
- pexpect=4.8.0=pyhd3eb1b0_3
- pickleshare=0.7.5=pyhd3eb1b0_1003
- pip=22.3.1=py38hecd8cb5_0
- pkgutil-resolve-name=1.3.10=py38hecd8cb5_0
- prometheus_client=0.14.1=py38hecd8cb5_0
- ptyprocess=0.7.0=pyhd3eb1b0_2
- pure_eval=0.2.2=pyhd3eb1b0_0
- pycparser=2.21=pyhd3eb1b0_0
- pyopenssl=23.0.0=py38hecd8cb5_0
- pysocks=1.7.1=py38_1
- python=3.8.16=h218abb5_3
- python-dateutil=2.8.2=pyhd3eb1b0_0
- python-fastjsonschema=2.16.2=py38hecd8cb5_0
- python-json-logger=2.0.7=py38hecd8cb5_0
- pyyaml=6.0=py38h6c40b1e_1
- readline=8.2=hca72f7f_0
- rfc3339-validator=0.1.4=py38hecd8cb5_0
- rfc3986-validator=0.1.1=py38hecd8cb5_0
- send2trash=1.8.0=pyhd3eb1b0_1
- setuptools=65.6.3=py38hecd8cb5_0
- six=1.16.0=pyhd3eb1b0_1
- soupsieve=2.4=py38hecd8cb5_0
- sqlite=3.40.1=h880c91c_0
- stack_data=0.2.0=pyhd3eb1b0_0
- terminado=0.17.1=py38hecd8cb5_0
- tinycss2=1.2.1=py38hecd8cb5_0
- tk=8.6.12=h5d9f67b_0
- tomli=2.0.1=py38hecd8cb5_0
- tornado=6.2=py38hca72f7f_0
- typing_extensions=4.4.0=py38hecd8cb5_0
- wheel=0.37.1=pyhd3eb1b0_0
- xz=5.2.10=h6c40b1e_1
- y-py=0.5.9=py38h7242b5c_0
- yaml=0.2.5=haf1e3a3_0
- ypy-websocket=0.8.2=py38hecd8cb5_0
- zeromq=4.3.4=h23ab428_0
- zlib=1.2.13=h4dc903c_0
- pip:
- alembic==1.10.2
- anyio==3.6.2
- appnope==0.1.3
- arrow==1.2.3
- asn1crypto==1.5.1
- asttokens==2.2.1
- async-generator==1.10
- attrs==22.2.0
- beautifulsoup4==4.11.2
- bleach==6.0.0
- cachetools==4.2.2
- certipy==0.1.3
- charset-normalizer==2.1.1
- cloudpickle==2.0.0
- cmaes==0.9.1
- colorlog==6.7.0
- contourpy==1.0.7
- cryptography==39.0.2
- cycler==0.11.0
- debugpy==1.6.6
- executing==1.2.0
- fastjsonschema==2.16.3
- filelock==3.9.0
- fonttools==4.39.0
- fqdn==1.5.1
- greenlet==2.0.2
- imbalanced-learn==0.10.1
- importlib-resources==5.12.0
- ipykernel==6.21.3
- ipython==8.11.0
- ipywidgets==8.0.4
- isoduration==20.11.0
- jedi==0.18.2
- joblib==1.2.0
- jsonpointer==2.3
- jupyter==1.0.0
- jupyter-client==8.0.3
- jupyter-console==6.6.3
- jupyter-core==5.2.0
- jupyter-server==2.4.0
- jupyter-server-terminals==0.4.4
- jupyter-telemetry==0.1.0
- jupyterhub==4.0.0
- jupyterlab-pygments==0.2.2
- jupyterlab-widgets==3.0.5
- kiwisolver==1.4.4
- lightgbm==3.2.1
- llvmlite==0.40.1rc1
- mako==1.2.4
- markupsafe==2.1.2
- matplotlib==3.7.1
- mistune==2.0.5
- nbclassic==0.5.3
- nbclient==0.7.2
- nbconvert==7.2.9
- nbformat==5.7.3
- numba==0.57.0
- numpy==1.22.3
- oauthlib==3.2.2
- optuna==3.1.0
- oscrypto==1.3.0
- pamela==1.1.0
- pandas==1.4.3
- pillow==9.4.0
- platformdirs==3.1.0
- prometheus-client==0.16.0
- prompt-toolkit==3.0.38
- psutil==5.9.4
- pyarrow==10.0.1
- pycryptodomex==3.17
- pygments==2.14.0
- pyjwt==2.6.0
- pyparsing==3.0.9
- pyrsistent==0.19.3
- pytz==2022.7.1
- pyzmq==25.0.0
- qtconsole==5.4.0
- qtpy==2.3.0
- requests==2.28.2
- ruamel-yaml==0.17.31
- ruamel-yaml-clib==0.2.7
- scikit-learn==1.1.1
- scipy==1.7.1
- seaborn==0.11.2
- sniffio==1.3.0
- snowflake==0.0.3
- snowflake-connector-python==3.0.1
- snowflake-snowpark-python==0.10.0
- sqlalchemy==2.0.5.post1
- stack-data==0.6.2
- threadpoolctl==3.1.0
- tqdm==4.65.0
- traitlets==5.9.0
- typing-extensions==4.5.0
- uri-template==1.2.0
- urllib3==1.26.14
- wcwidth==0.2.6
- webcolors==1.12
- webencodings==0.5.1
- websocket-client==1.5.1
- widgetsnbextension==4.0.5
- xgboost==1.5.0
- zipp==3.15.0
Next open up a command line terminal and type in the following from the same folder that you created the pysnowpark_ml.yml file
conda env create -f pysnowpark_ml.yml
Then activate your environment
conda activate pysnowpark_ml
And finally start your Jupyter notebook
jupyter notebook
Note if you don’t have anaconda you can download it by following the instructions below
Next, Spin Up Your Jupyter Notebook Environment
Easiest way to do this is to download Anaconda from here, install and open it and then click on the Jupyter Notebook tile
Which should create a sheet that looks something like:
And then click on the New button like this and select a Python3 Notebook
Which gets you to here
Next, Copy and Execute These Cells In Order
from snowflake.snowpark.session import Session
from snowflake.snowpark.functions import sproc
import snowflake.snowpark.functions as F
import snowflake.snowpark.types as T
import pandas as pd
import json
import os
import sys
from sklearn.linear_model import LogisticRegression
from cachetools import cached
snowflake_connection_cfg = json.loads(open("creds.json").read())
session = Session.builder.configs(snowflake_connection_cfg).create()
session.sql("CREATE OR REPLACE WAREHOUSE VB_VH WITH WAREHOUSE_SIZE='X-SMALL'").collect()
session.sql("CREATE OR REPLACE DATABASE VB_DB").collect()
session.sql("CREATE OR REPLACE STAGE ML_MODELS").collect()
# Build a synthetic binary-classification dataset and persist it to Snowflake
# as the LOAN_DATASET table.
from sklearn.datasets import make_classification
import pandas as pd

columns = [str(i) for i in range(0, 10)]
features, labels = make_classification(n_samples=100000, n_features=10, n_classes=2)
df = pd.DataFrame(features, columns=columns)
feature_cols = df.columns
df['LOAN_DEFAULT'] = labels
session.write_pandas(df, table_name='LOAN_DATASET', auto_create_table=True, overwrite=True)
@sproc(session=session, name='get_scores_sf', stage_location='@ML_MODELS',
       packages=['snowflake-snowpark-python', 'scikit-learn', 'numpy'],
       is_permanent=True,
       replace=True)
def train_model(session: Session, train_table: str, feature_cols: list, target_col: str, model_name: str) -> str:
    """Train a logistic-regression model inside Snowflake and stage the artifact.

    Args:
        session: Snowpark session injected by the stored-procedure runtime.
        train_table: Name of the table holding the training data.
        feature_cols: Column names to use as model features.
        target_col: Column name of the binary target.
        model_name: File name under which the fitted model is uploaded to @ML_MODELS.

    Returns:
        A status message once the model is trained and staged.
    """
    from sklearn.linear_model import LogisticRegression
    from joblib import dump

    local_training_data = session.table(train_table).to_pandas()
    X, y = local_training_data[feature_cols], local_training_data[target_col]
    lrmodel = LogisticRegression().fit(X, y)
    # Serialize to local tmp storage first, then upload to the model stage.
    dump(lrmodel, '/tmp/' + model_name)
    session.file.put('/tmp/' + model_name, '@ML_MODELS', auto_compress=False, overwrite=True)
    return 'ML Model trained and saved'


# Invoke the stored procedure; the @sproc decorator supplies the session argument.
# (In the original these two statements were fused onto one invalid line.)
train_model("LOAN_DATASET", list(feature_cols), 'LOAN_DEFAULT', 'lr_model.sav')
@cached(cache={})
def load_model(model_path: str) -> object:
    """Load a joblib-serialized model once and cache it for reuse across batches."""
    from joblib import load
    model = load(model_path)
    return model


# (In the original, `return model` and this `def` were fused onto one invalid line.)
def udf_model(df: pd.DataFrame) -> pd.Series:
    """Vectorized UDF body: score a batch of feature rows with the staged model.

    The staged model file is unpacked into the UDF's import directory at
    runtime, whose path Snowflake exposes via sys._xoptions.
    """
    model = load_model(sys._xoptions["snowflake_import_directory"] + 'lr_model.sav')
    scored_data = pd.Series(model.predict(df))
    return scored_data
# Register the scoring function as a permanent vectorized UDF.
# NOTE(review): `session.udf.register` is already bound to the session, so the
# original `session=session` keyword is not an accepted argument and raises a
# TypeError — it is dropped here. Confirm against the installed snowpark version.
udf_model = session.udf.register(func=udf_model, name="udf_score_lr_model",
                                 stage_location='@ML_MODELS',
                                 input_types=[T.FloatType()] * len(feature_cols),
                                 return_type=T.FloatType(),
                                 replace=True, is_permanent=True,
                                 max_batch_size=1000,
                                 imports=['@ML_MODELS/lr_model.sav'],
                                 packages=['scikit-learn==1.1.1', 'pandas', 'joblib', 'cachetools'])

# Score the whole table server-side and pull the predictions back as pandas.
test_sdf = session.table('LOAN_DATASET')
test_sdf.with_column('DEFAULT_PREDICTION', udf_model(*feature_cols)).to_pandas()
Which should result in something like this
This will look pretty much the same, but naturally there will be some tweaks that reflect the different IDE