Terence Junjie LIU

A pseudo programmer's blog

Writing Python with R

Terence Lau / 2020-09-03


Or, this post could have been titled: Calling Python in R.

R (R Core Team 2019) and Python are two tools for data analysis, data mining, and other data-related work. As we all know, such work is laborious and time-consuming, and computers, or rather programming languages, are the tools that can ease our burden.

Please don't argue about which language is the best; just choose whatever you like and whatever you need.

Introduction

In 2018, RStudio (RStudio Team 2020) released a package called reticulate, a comprehensive set of tools for interoperability between Python and R. Among other things, the package lets you import Python modules and call them from R, translates automatically between R and Python objects (for example, between R data frames and pandas DataFrames), and binds flexibly to different versions of Python, including virtualenvs and Conda environments.

I would say this package may change the workflow of statistics-related research: researchers have focused on R and traditional statistical methods for so many years that they may have overlooked computer science's extraordinary progress in neural networks, deep learning, and so on. So many packages in those areas are written in Python or aimed at Python users.
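
To give a flavour of the API, here is a minimal sketch (my own, not from the original post) of the core reticulate calls; it assumes a working Python installation with pandas available:

library(reticulate)

# Import a Python module and call its functions with $
os <- import("os")
os$getcwd()

# Run Python code directly and read the result back via the py object
py_run_string("squares = [i ** 2 for i in range(5)]")
py$squares

# Explicit conversion between R and Python objects
p <- r_to_py(mtcars)   # R data.frame -> pandas DataFrame
d <- py_to_r(p)        # and back to an R data.frame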

Example

First, installation:

# install.packages("reticulate", dep = T, repos="https://mirrors.tuna.tsinghua.edu.cn/CRAN/")

Then you can just use it!
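
reticulate binds to the first Python it finds unless told otherwise; if you need a particular interpreter, you can point it there before first use. A small sketch, with illustrative paths:

library(reticulate)
# use_python("C:/Python38/python.exe")   # bind a specific interpreter
# use_virtualenv("r-reticulate")         # or a named virtualenv
py_config()   # reports which Python reticulate has bound to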

library(reticulate)
## Warning: package 'reticulate' was built under R version 4.0.3
# use_python()
# use_virtualenv()
Sys.which("python")
##                                                                        python 
## "C:\\Users\\TERENC~1\\AppData\\Local\\Programs\\Python\\Python38\\python.exe"
pandas <- import('pandas')
data <- pandas$read_csv('dataset/Stock_FX_Bond.csv')
attach(data)
par(mfrow = c(1, 2))
plot(GM_AC, type = "l")
plot(F_AC, type = "b")

n = dim(data)[1]
GMReturn = GM_AC[-1] / GM_AC[-n] - 1 # index -1/-n means all indices except the first/last
FReturn = F_AC[-1] / F_AC[-n] - 1
par(mfrow = c(1,1))
plot(GMReturn, FReturn)
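
A variation worth knowing (my sketch, using the same CSV as above): importing with convert = FALSE keeps results as real pandas objects, so you can chain pandas methods from R and convert back only at the end with py_to_r():

pd <- import("pandas", convert = FALSE)
df <- pd$read_csv("dataset/Stock_FX_Bond.csv")
df$head()           # a pandas method, evaluated on the Python side
head(py_to_r(df))   # convert to an R data.frame for R functions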

You can also use PyTorch, an open-source machine learning framework that can accelerate your programmes.

torch <- import('torch')
torch$set_default_dtype(torch$float64)
torch$tensor(list(1.2, 3))
## tensor([1.2000, 3.0000])
byte    <- torch$ByteTensor(3L, 3L)
float   <- torch$FloatTensor(3L, 3L)
double  <- torch$DoubleTensor(3L, 3L)
long    <- torch$LongTensor(3L, 3L)
boolean <- torch$BoolTensor(5L, 5L)
byte
## tensor([[ 0,  0,  0],
##         [ 0,  0,  0],
##         [ 0,  0, 48]], dtype=torch.uint8)
float
## tensor([[0., 0., 0.],
##         [0., 0., 0.],
##         [0., 0., 0.]], dtype=torch.float32)
double
## tensor([[8.4598e-315, 8.4598e-315, 8.4598e-315],
##         [8.4598e-315, 8.5118e-315, 8.4897e-315],
##         [8.5573e-315, 8.5573e-315, 8.5571e-315]])
long
## tensor([[1731988880, 1718339056, 1732027856],
##         [1718338608, 1712276816, 1712276816],
##         [1732028624, 1732009488, 1731965328]])
boolean
## tensor([[ True, False, False, False, False],
##         [False, False, False, False, False],
##         [False, False, False, False, False],
##         [False, False, False, False, False],
##         [False, False, False, False, False]])
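
Autograd works through the same interface. A minimal sketch (assumed, not part of the original example), continuing with the torch handle imported above:

x <- torch$tensor(2.0, requires_grad = TRUE)
y <- x$pow(2L)$add(x$mul(3.0))   # y = x^2 + 3x
y$backward()                     # compute dy/dx
x$grad                           # 2x + 3 = 7 at x = 2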

Here is an example of using torch to classify the MNIST handwritten digits (Choi 2018):

torch <- import('torch')
torchvision <- import('torchvision')
nn          <- torch$nn
transforms  <- torchvision$transforms
torch$set_default_dtype(torch$float)

# Hyper-parameters 
input_size    <- 784L
num_classes   <- 10L
num_epochs    <- 5L
batch_size    <- 100L
learning_rate <- 0.001

# MNIST dataset (images and labels)
# IDX format
local_folder <- './datasets/raw_data'
train_dataset = torchvision$datasets$MNIST(root=local_folder, 
                                           train=TRUE, 
                                           transform=transforms$ToTensor(),
                                           download=TRUE)

test_dataset = torchvision$datasets$MNIST(root=local_folder, 
                                          train=FALSE, 
                                          transform=transforms$ToTensor())

# Data loader (input pipeline): make the datasets iterable
train_loader = torch$utils$data$DataLoader(dataset=train_dataset, 
                                           batch_size=batch_size, 
                                           shuffle=TRUE)

test_loader = torch$utils$data$DataLoader(dataset=test_dataset, 
                                          batch_size=batch_size, 
                                          shuffle=FALSE)

class(train_loader)
## [1] "torch.utils.data.dataloader.DataLoader"
## [2] "python.builtin.object"
#> [1] "torch.utils.data.dataloader.DataLoader"
#> [2] "python.builtin.object"
length(train_loader)
## [1] 2
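
Note that length() here reports the length of the R wrapper object rather than the number of batches; to query Python's __len__ you can use reticulate's py_len():

py_len(train_loader)   # should be 600: 60000 training images / batches of 100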

# Logistic regression model
model = nn$Linear(input_size, num_classes)

# Loss and optimizer
# nn.CrossEntropyLoss() computes softmax internally
criterion = nn$CrossEntropyLoss()  
optimizer = torch$optim$SGD(model$parameters(), lr=learning_rate)  
print(model)
## Linear(in_features=784, out_features=10, bias=True)

# Train the model
iter_train_loader <- iterate(train_loader)
total_step <- length(iter_train_loader)

for (epoch in 1:num_epochs) {
    i <-  0
    for (obj in iter_train_loader) {
        
        images <- obj[[1]]   # tensor torch.Size([100, 1, 28, 28])
        labels <- obj[[2]]   # tensor torch.Size([100]), labels from 0 to 9
        # cat(i, "\t"); print(images$shape)

        # Reshape images to (batch_size, input_size)
        images <- images$reshape(-1L, 28L*28L)
        # images <- torch$as_tensor(images$reshape(-1L, 28L*28L), dtype=torch$double)

        # Forward pass
        outputs <- model(images)
        loss <- criterion(outputs, labels)

        # Backward and optimize
        optimizer$zero_grad()
        loss$backward()
        optimizer$step()

        if ((i+1) %% 100 == 0) {
            cat(sprintf('Epoch [%d/%d], Step [%d/%d], Loss: %f \n',
                epoch, num_epochs, i+1, total_step, loss$item()))
        }
        i <-  i + 1
    }
}  
## Epoch [1/5], Step [100/600], Loss: 2.243049 
## Epoch [1/5], Step [200/600], Loss: 2.120401 
## Epoch [1/5], Step [300/600], Loss: 2.071192 
## Epoch [1/5], Step [400/600], Loss: 1.958295 
## Epoch [1/5], Step [500/600], Loss: 1.879076 
## Epoch [1/5], Step [600/600], Loss: 1.805387 
## Epoch [2/5], Step [100/600], Loss: 1.750502 
## Epoch [2/5], Step [200/600], Loss: 1.675197 
## Epoch [2/5], Step [300/600], Loss: 1.673908 
## Epoch [2/5], Step [400/600], Loss: 1.551905 
## Epoch [2/5], Step [500/600], Loss: 1.499509 
## Epoch [2/5], Step [600/600], Loss: 1.459108 
## Epoch [3/5], Step [100/600], Loss: 1.423461 
## Epoch [3/5], Step [200/600], Loss: 1.389941 
## Epoch [3/5], Step [300/600], Loss: 1.405990 
## Epoch [3/5], Step [400/600], Loss: 1.284738 
## Epoch [3/5], Step [500/600], Loss: 1.253112 
## Epoch [3/5], Step [600/600], Loss: 1.229778 
## Epoch [4/5], Step [100/600], Loss: 1.206374 
## Epoch [4/5], Step [200/600], Loss: 1.200724 
## Epoch [4/5], Step [300/600], Loss: 1.223891 
## Epoch [4/5], Step [400/600], Loss: 1.104128 
## Epoch [4/5], Step [500/600], Loss: 1.087438 
## Epoch [4/5], Step [600/600], Loss: 1.071864 
## Epoch [5/5], Step [100/600], Loss: 1.057129 
## Epoch [5/5], Step [200/600], Loss: 1.068653 
## Epoch [5/5], Step [300/600], Loss: 1.095702 
## Epoch [5/5], Step [400/600], Loss: 0.976525 
## Epoch [5/5], Step [500/600], Loss: 0.970327 
## Epoch [5/5], Step [600/600], Loss: 0.958247
# Test the model (no gradient computation needed in the test phase)
iter_test_loader <- iterate(test_loader)

with(torch$no_grad(), {
    correct <-  0
    total <-  0
    for (obj in iter_test_loader) {
        images <- obj[[1]]   # tensor torch.Size([100, 1, 28, 28])
        labels <- obj[[2]]   # tensor torch.Size([100]), labels from 0 to 9
        images = images$reshape(-1L, 28L*28L)
        # images <- torch$as_tensor(images$reshape(-1L, 28L*28L), dtype=torch$double)
        outputs = model(images)
        # torch$max returns a (values, indices) pair; indexing the returned
        # Python object is zero-based, so [1L] selects the indices element
        .predicted = torch$max(outputs$data, 1L)
        predicted <- .predicted[1L]
        total = total + labels$size(0L)
        correct = correct + sum((predicted$numpy() == labels$numpy()))
    }
    cat(sprintf('Accuracy of the model on the 10000 test images: %f %%', (100 * correct / total)))
  
})
## Accuracy of the model on the 10000 test images: 83.000000 %
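
Finally, the trained weights can be persisted and restored with torch's standard serialization API (a sketch; the file name is illustrative):

torch$save(model$state_dict(), "model.ckpt")       # save the weights
model2 <- nn$Linear(input_size, num_classes)       # same architecture
model2$load_state_dict(torch$load("model.ckpt"))   # restore the weights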

End

As you can see, data analysis is not a matter of R versus Python; with reticulate you can use both.

Reference

Choi, Yunjey. 2018. “PyTorch Tutorials 01-Basics: MNIST Handwritten Digits.” https://github.com/yunjey/pytorch-tutorial/blob/master/tutorials/01-basics/logistic_regression/main.py.

R Core Team. 2019. R: A Language and Environment for Statistical Computing. Vienna, Austria: R Foundation for Statistical Computing. https://www.R-project.org/.

RStudio Team. 2020. RStudio: Integrated Development Environment for R. Boston, MA: RStudio, PBC. http://www.rstudio.com/.