6. Properties per sequence#

Until now we obtained data for each molecule separately. Here we can combine the data per sequence.

[1]:
from pathlib2 import Path
import papylio as pp
import matplotlib.pyplot as plt
import tqdm
import numpy as np
import xarray as xr

from papylio.plugins.sequencing.sequence_generation import generate_sequences

%matplotlib inline

Experiment import#

Note that the sequencing data is automatically imported.

[2]:
experiment_path = Path(r'C:\Users\user\Desktop\SPARXS example dataset')
[3]:
data_per_sequence_path = experiment_path / 'Analysis' / 'Datasets per sequence'
data_per_sequence_path
[3]:
WindowsPath('C:/Users/user/Desktop/SPARXS example dataset/Analysis/Datasets per sequence')
[4]:
save_path = experiment_path / 'Analysis'
save_path
[4]:
WindowsPath('C:/Users/user/Desktop/SPARXS example dataset/Analysis')
[5]:
exp = pp.Experiment(data_per_sequence_path)
Import files: 100%|█████████████████████████████████████████████████████████████| 7948/7948 [00:00<00:00, 42768.82it/s]

Initialize experiment:
C:\Users\user\Desktop\SPARXS example dataset\Analysis\Datasets per sequence
[6]:
file_HJ1 = exp.files.select('TTAGCCGA', 'name')[0]
file_HJ3 = exp.files.select('AATCGGCT', 'name')[0]
file_HJ7 = exp.files.select('GGCGCCGC', 'name')[0]
files_HJ137 = exp.files.select('TTAGCCGA|AATCGGCT|GGCGCCGC', 'name')
[7]:
files = files_HJ137

Create properties per sequence dataset#

Here we create a new dataset to store the properties per sequence.

[8]:
ds = xr.Dataset()
ds['sequence_subset'] = ['TTAGCCGA', 'AATCGGCT', 'GGCGCCGC']
# ds['sequence_subset'] = generate_sequences('NNNNNNNN') # When measuring all variations
ds.set_coords('sequence_subset')
ds.to_netcdf(save_path / 'properties_per_sequence.nc')

ds
[8]:
<xarray.Dataset>
Dimensions:          (sequence_subset: 3)
Coordinates:
  * sequence_subset  (sequence_subset) <U8 'TTAGCCGA' 'AATCGGCT' 'GGCGCCGC'
Data variables:
    *empty*

Number of molecules#

Add the total number of molecules and the number of selected molecules

[9]:
ds = xr.load_dataset(save_path / 'properties_per_sequence.nc')
number_of_molecules_selected = []
number_of_molecules = []
sequence_subset = []
for file in tqdm.tqdm(files):
    number_of_molecules_selected.append(file.number_of_selected_molecules)
    number_of_molecules.append(file.number_of_molecules)
    sequence_subset.append(file.name)

ds['number_of_molecules'] = xr.DataArray(number_of_molecules, dims=('sequence_subset'),
                                               coords={'sequence_subset': sequence_subset})
ds['number_of_molecules_selected'] = xr.DataArray(number_of_molecules_selected, dims=('sequence_subset'),
                                                  coords={'sequence_subset': sequence_subset})

ds.to_netcdf(save_path / 'properties_per_sequence.nc')

ds
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 21.23it/s]
[9]:
<xarray.Dataset>
Dimensions:                       (sequence_subset: 3)
Coordinates:
  * sequence_subset               (sequence_subset) object 'TTAGCCGA' ... 'GG...
Data variables:
    number_of_molecules           (sequence_subset) int32 669 1006 753
    number_of_molecules_selected  (sequence_subset) int32 241 284 238

Number of states#

Add the number of molecules with one or two states, and the fraction of molecules that show two states.

[10]:
state_count = files.serial.state_count(selected=True, states=np.arange(3))
state_count = state_count.rename(name='sequence_subset')

state_count.to_netcdf(save_path / 'state_count.nc')

state_count
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 23.08it/s]
Serial processing

[10]:
<xarray.DataArray 'state_count' (sequence_subset: 3, number_of_states: 3)>
array([[ 12,  55, 217],
       [  8,  43, 187],
       [  1,  41, 199]])
Coordinates:
  * sequence_subset   (sequence_subset) <U8 'AATCGGCT' 'GGCGCCGC' 'TTAGCCGA'
  * number_of_states  (number_of_states) int32 0 1 2
[11]:
state_count = xr.load_dataarray(save_path / 'state_count.nc')

count_1_state = state_count.sel(number_of_states=1, drop=True)
count_1_state.name = 'count_1_state'
count_1_state.attrs['unit'] = ''

count_2_states = state_count.sel(number_of_states=2, drop=True)
count_2_states.name = 'count_2_states'
count_2_states.attrs['unit'] = ''

fraction_2_states = count_2_states / (count_1_state + count_2_states)
fraction_2_states.name = 'fraction_2_states'
fraction_2_states.attrs['unit'] = ''

ds = xr.load_dataset(save_path / 'properties_per_sequence.nc')

ds['count_1_state'] = count_1_state
ds['count_2_states'] = count_2_states
ds['fraction_2_states'] = fraction_2_states

ds.to_netcdf(save_path / 'properties_per_sequence.nc')

ds
[11]:
<xarray.Dataset>
Dimensions:                       (sequence_subset: 3)
Coordinates:
  * sequence_subset               (sequence_subset) object 'TTAGCCGA' ... 'GG...
Data variables:
    number_of_molecules           (sequence_subset) int32 669 1006 753
    number_of_molecules_selected  (sequence_subset) int32 241 284 238
    count_1_state                 (sequence_subset) int32 41 55 43
    count_2_states                (sequence_subset) int32 199 217 187
    fraction_2_states             (sequence_subset) float64 0.8292 0.7978 0.813

Transition rates#

Add the mean and standard deviation of the transition rates among the molecules

[12]:
files.name.data
[12]:
['AATCGGCT', 'GGCGCCGC', 'TTAGCCGA']
[13]:
transition_rates_hmm = xr.Dataset(coords={'sequence_subset': files.name.data})
transition_rates_mean_hmm = files.get_variable('transition_rate', selected=True, average='molecule')
transition_rates_mean_hmm = transition_rates_mean_hmm.rename(name='sequence_subset')
transition_rates_mean_hmm.attrs['unit'] = '/s'
transition_rates_hmm['mean'] = transition_rates_mean_hmm

def transition_rate_std_hmm(file):
    return file.get_variable('transition_rate', selected=True).std(dim='molecule').expand_dims({'sequence_subset': [file.name]}, 0)

transition_rates_std_hmm = xr.concat(files.map(transition_rate_std_hmm)(), dim='sequence_subset')
transition_rates_std_hmm.attrs['unit'] = '/s'
transition_rates_hmm['std'] = transition_rates_std_hmm

transition_rates_hmm.to_netcdf(save_path / 'transition_rates_hmm.nc')

transition_rates_hmm
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 23.42it/s]
  0%|                                                                                            | 0/3 [00:00<?, ?it/s]
Serial processing
Serial processing
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 23.88it/s]
[13]:
<xarray.Dataset>
Dimensions:          (sequence_subset: 3, from_state: 2, to_state: 2)
Coordinates:
  * sequence_subset  (sequence_subset) object 'AATCGGCT' 'GGCGCCGC' 'TTAGCCGA'
Dimensions without coordinates: from_state, to_state
Data variables:
    mean             (sequence_subset, from_state, to_state) float32 -6.31 .....
    std              (sequence_subset, from_state, to_state) float32 4.638 .....
[14]:
transition_rates_hmm = xr.load_dataset(save_path / 'transition_rates_hmm.nc')
ds = xr.load_dataset(save_path / 'properties_per_sequence.nc')

ds['transition_rate_0_1'] = transition_rates_hmm['mean'].sel(from_state=0, to_state=1, drop=True)
ds['transition_rate_0_1'].attrs['unit'] = '/s'
ds['transition_rate_1_0'] = transition_rates_hmm['mean'].sel(from_state=1, to_state=0, drop=True)
ds['transition_rate_1_0'].attrs['unit'] = '/s'

ds.to_netcdf(save_path / 'properties_per_sequence.nc')

ds
[14]:
<xarray.Dataset>
Dimensions:                       (sequence_subset: 3)
Coordinates:
  * sequence_subset               (sequence_subset) object 'TTAGCCGA' ... 'GG...
Data variables:
    number_of_molecules           (sequence_subset) int32 669 1006 753
    number_of_molecules_selected  (sequence_subset) int32 241 284 238
    count_1_state                 (sequence_subset) int32 41 55 43
    count_2_states                (sequence_subset) int32 199 217 187
    fraction_2_states             (sequence_subset) float64 0.8292 0.7978 0.813
    transition_rate_0_1           (sequence_subset) float32 3.722 7.909 3.355
    transition_rate_1_0           (sequence_subset) float32 8.402 2.994 2.902

FRET#

Add the mean and standard deviation of the FRET values for each state, with separate values for 1-state molecules and 2-state molecules.

[15]:
def generate_FRET_classification(file):
    FRET_classification = xr.Dataset()
    FRET_per_state = file.FRET.where(file.classification_binary(positive_states_only=True)).transpose(...,'frame')
    FRET_1_state = FRET_per_state.sel(molecule=(file.number_of_states==1)&file.selected.values).mean(['molecule','frame'])
    FRET_2_states = FRET_per_state.sel(molecule=(file.number_of_states==2)&file.selected.values).mean(['molecule','frame'])
    FRET_std_1_state = FRET_per_state.sel(molecule=(file.number_of_states==1)&file.selected.values).std(['molecule','frame'])
    FRET_std_2_states = FRET_per_state.sel(molecule=(file.number_of_states==2)&file.selected.values).std(['molecule','frame'])

    FRET_classification['mean'] = xr.concat([FRET_1_state, FRET_2_states], dim='number_of_states')\
        .assign_coords(number_of_states=[1,2])
    FRET_classification['std'] = xr.concat([FRET_std_1_state, FRET_std_2_states], dim='number_of_states')\
        .assign_coords(number_of_states=[1,2])
    FRET_classification
    return FRET_classification

FRET_classification = xr.concat(files.serial.map(generate_FRET_classification)(), dim='name')\
    .assign_coords(sequence_subset=files.name)
FRET_classification = FRET_classification.rename(name='sequence_subset')

FRET_classification.to_netcdf(save_path / 'FRET_classification.nc')

FRET_classification
  0%|                                                                                            | 0/3 [00:00<?, ?it/s]
Serial processing
C:\Users\user\miniconda3\envs\trace_analysis\lib\site-packages\numpy\lib\nanfunctions.py:1879: RuntimeWarning: Degrees of freedom <= 0 for slice.
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
 33%|████████████████████████████                                                        | 1/3 [00:00<00:00,  3.26it/s]C:\Users\user\miniconda3\envs\trace_analysis\lib\site-packages\numpy\lib\nanfunctions.py:1879: RuntimeWarning: Degrees of freedom <= 0 for slice.
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
 67%|████████████████████████████████████████████████████████                            | 2/3 [00:00<00:00,  3.54it/s]C:\Users\user\miniconda3\envs\trace_analysis\lib\site-packages\numpy\lib\nanfunctions.py:1879: RuntimeWarning: Degrees of freedom <= 0 for slice.
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00,  3.60it/s]
[15]:
<xarray.Dataset>
Dimensions:           (state: 2, number_of_states: 2, sequence_subset: 3)
Coordinates:
  * state             (state) int8 0 1
  * number_of_states  (number_of_states) int32 1 2
  * sequence_subset   (sequence_subset) <U8 'AATCGGCT' 'GGCGCCGC' 'TTAGCCGA'
Data variables:
    mean              (sequence_subset, number_of_states, state) float64 0.22...
    std               (sequence_subset, number_of_states, state) float64 0.18...
[16]:
FRET_classification = xr.load_dataset(save_path / 'FRET_classification.nc')
ds = xr.load_dataset(save_path / 'properties_per_sequence.nc')

ds['FRET_1_state'] = FRET_classification['mean'].sel(number_of_states=1, state=0, drop=True)
ds['FRET_std_1_state'] = FRET_classification['std'].sel(number_of_states=1, state=0, drop=True)
ds['FRET_2_states_0'] = FRET_classification['mean'].sel(number_of_states=2, state=0, drop=True)
ds['FRET_2_states_1'] = FRET_classification['mean'].sel(number_of_states=2, state=1, drop=True)
ds['FRET_std_2_states_0'] = FRET_classification['std'].sel(number_of_states=2, state=0, drop=True)
ds['FRET_std_2_states_1'] = FRET_classification['std'].sel(number_of_states=2, state=1, drop=True)

ds.to_netcdf(save_path / 'properties_per_sequence.nc')

ds
[16]:
<xarray.Dataset>
Dimensions:                       (sequence_subset: 3)
Coordinates:
  * sequence_subset               (sequence_subset) object 'TTAGCCGA' ... 'GG...
Data variables: (12/13)
    number_of_molecules           (sequence_subset) int32 669 1006 753
    number_of_molecules_selected  (sequence_subset) int32 241 284 238
    count_1_state                 (sequence_subset) int32 41 55 43
    count_2_states                (sequence_subset) int32 199 217 187
    fraction_2_states             (sequence_subset) float64 0.8292 0.7978 0.813
    transition_rate_0_1           (sequence_subset) float32 3.722 7.909 3.355
    ...                            ...
    FRET_1_state                  (sequence_subset) float64 0.197 0.2284 0.2214
    FRET_std_1_state              (sequence_subset) float64 0.1259 0.181 0.1568
    FRET_2_states_0               (sequence_subset) float64 0.1861 0.2661 0.2003
    FRET_2_states_1               (sequence_subset) float64 0.4251 0.5078 0.472
    FRET_std_2_states_0           (sequence_subset) float64 0.08322 ... 0.09612
    FRET_std_2_states_1           (sequence_subset) float64 0.1324 ... 0.1094

The data can be easily converted to an excel sheet.

[17]:
ds.to_pandas().to_excel(save_path / 'properties_per_sequence.xlsx')
ds.to_pandas()
[17]:
number_of_molecules number_of_molecules_selected count_1_state count_2_states fraction_2_states transition_rate_0_1 transition_rate_1_0 FRET_1_state FRET_std_1_state FRET_2_states_0 FRET_2_states_1 FRET_std_2_states_0 FRET_std_2_states_1
sequence_subset
TTAGCCGA 669 241 41 199 0.829167 3.722390 8.402450 0.196950 0.125939 0.186073 0.425090 0.083220 0.132375
AATCGGCT 1006 284 55 217 0.797794 7.908979 2.993645 0.228415 0.181038 0.266083 0.507820 0.119068 0.096357
GGCGCCGC 753 238 43 187 0.813043 3.355469 2.901715 0.221436 0.156791 0.200279 0.471952 0.096120 0.109417