6. Properties per sequence

Contents

6. Properties per sequence

Until now, we obtained data for each molecule separately. Here we combine the data per sequence.

[1]:

from pathlib2 import Path
import papylio as pp
import matplotlib.pyplot as plt
import tqdm
import numpy as np
import xarray as xr

from papylio.plugins.sequencing.sequence_generation import generate_sequences

%matplotlib inline

Experiment import

Note that the sequencing data is automatically imported.

[2]:

# Root folder of the example dataset; adjust this to the location on your own machine.
experiment_path = Path(r'C:\Users\user\Desktop\SPARXS example dataset')

[3]:

# Folder inside the experiment that holds the datasets per sequence;
# it is the folder opened as an experiment further below.
data_per_sequence_path = experiment_path / 'Analysis' / 'Datasets per sequence'
data_per_sequence_path

[3]:

WindowsPath('C:/Users/user/Desktop/SPARXS example dataset/Analysis/Datasets per sequence')

[4]:

# Folder in which the netCDF and Excel files produced in this notebook are written.
save_path = experiment_path / 'Analysis'
save_path

[4]:

WindowsPath('C:/Users/user/Desktop/SPARXS example dataset/Analysis')

[5]:

# Open the per-sequence datasets folder as an experiment; the files in it are
# imported during construction (see the progress output of this cell).
exp = pp.Experiment(data_per_sequence_path)

Import files: 100%|█████████████████████████████████████████████████████████████| 7948/7948 [00:00<00:00, 42768.82it/s]


Initialize experiment:
C:\Users\user\Desktop\SPARXS example dataset\Analysis\Datasets per sequence

[6]:

# Select files by matching a pattern against their 'name' attribute.
# NOTE(review): the pattern is presumably a regular expression, given the `|`
# alternation in the last call — confirm against the papylio documentation.
# [0] takes the first matching file for each individual sequence.
file_HJ1 = exp.files.select('TTAGCCGA', 'name')[0]
file_HJ3 = exp.files.select('AATCGGCT', 'name')[0]
file_HJ7 = exp.files.select('GGCGCCGC', 'name')[0]
# Collection containing the files of all three sequences together.
files_HJ137 = exp.files.select('TTAGCCGA|AATCGGCT|GGCGCCGC', 'name')

[7]:

# File collection that the cells below operate on.
files = files_HJ137

Create properties per sequence dataset

Here we create a new dataset to store the properties per sequence.

[8]:

# Start from an empty dataset that is indexed by the measured sequences.
ds = xr.Dataset()
ds['sequence_subset'] = ['TTAGCCGA', 'AATCGGCT', 'GGCGCCGC']
# ds['sequence_subset'] = generate_sequences('NNNNNNNN') # When measuring all variations

# set_coords returns a new dataset instead of modifying `ds` in place,
# so the result has to be assigned for the call to have any effect.
ds = ds.set_coords('sequence_subset')

# Write the (still empty) dataset to disk; later cells load this file, add variables and save it again.
ds.to_netcdf(save_path / 'properties_per_sequence.nc')

ds

[8]:

<xarray.Dataset>
Dimensions:          (sequence_subset: 3)
Coordinates:
  * sequence_subset  (sequence_subset) <U8 'TTAGCCGA' 'AATCGGCT' 'GGCGCCGC'
Data variables:
    *empty*

FRET

Add the mean and standard deviation of the FRET values for each state, with separate values for 1-state molecules and 2-state molecules.

[15]:

def generate_FRET_classification(file):
    """Compute the mean and standard deviation of the FRET per state for one file.

    The FRET data of the file is masked with
    ``file.classification_binary(positive_states_only=True)``. The statistics
    are then taken over the ``molecule`` and ``frame`` dimensions, separately
    for the selected molecules with one state and for those with two states.

    Parameters
    ----------
    file
        File object providing ``FRET``, ``classification_binary``,
        ``number_of_states`` and ``selected``.

    Returns
    -------
    xr.Dataset
        Dataset with the variables ``mean`` and ``std``. Both have a
        ``number_of_states`` dimension with the coordinate values ``[1, 2]``.
    """
    # FRET values outside the classified states are masked out by `where`;
    # 'frame' is moved to the last position.
    FRET_per_state = file.FRET.where(file.classification_binary(positive_states_only=True)).transpose(..., 'frame')

    state_counts = [1, 2]
    means = []
    stds = []
    for state_count in state_counts:
        # Only molecules that are selected and have exactly `state_count` states.
        molecule_mask = (file.number_of_states == state_count) & file.selected.values
        # Select once and derive both statistics from the same subset.
        FRET_subset = FRET_per_state.sel(molecule=molecule_mask)
        means.append(FRET_subset.mean(['molecule', 'frame']))
        # NOTE(review): combinations without any data points (presumably the second
        # state of 1-state molecules) give NaN and are the likely source of numpy's
        # "Degrees of freedom <= 0 for slice" RuntimeWarning — confirm.
        stds.append(FRET_subset.std(['molecule', 'frame']))

    FRET_classification = xr.Dataset()
    FRET_classification['mean'] = xr.concat(means, dim='number_of_states')\
        .assign_coords(number_of_states=state_counts)
    FRET_classification['std'] = xr.concat(stds, dim='number_of_states')\
        .assign_coords(number_of_states=state_counts)
    return FRET_classification

# Compute the statistics for every file and stack the results directly along a
# 'sequence_subset' dimension that is labelled with the file names.
# Concatenating along the final dimension avoids the detour over a temporary
# 'name' dimension, which had to be renamed onto a separately assigned
# 'sequence_subset' coordinate afterwards.
FRET_classification = xr.concat(files.serial.map(generate_FRET_classification)(), dim='sequence_subset')\
    .assign_coords(sequence_subset=files.name)

# Store the result so that it can be reloaded without recomputing it.
FRET_classification.to_netcdf(save_path / 'FRET_classification.nc')

FRET_classification

  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Serial processing

C:\Users\user\miniconda3\envs\trace_analysis\lib\site-packages\numpy\lib\nanfunctions.py:1879: RuntimeWarning: Degrees of freedom <= 0 for slice.
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
 33%|████████████████████████████                                                        | 1/3 [00:00<00:00,  3.26it/s]C:\Users\user\miniconda3\envs\trace_analysis\lib\site-packages\numpy\lib\nanfunctions.py:1879: RuntimeWarning: Degrees of freedom <= 0 for slice.
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
 67%|████████████████████████████████████████████████████████                            | 2/3 [00:00<00:00,  3.54it/s]C:\Users\user\miniconda3\envs\trace_analysis\lib\site-packages\numpy\lib\nanfunctions.py:1879: RuntimeWarning: Degrees of freedom <= 0 for slice.
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00,  3.60it/s]

[16]:

# Reload both datasets from disk so this cell can be run on its own.
FRET_classification = xr.load_dataset(save_path / 'FRET_classification.nc')
ds = xr.load_dataset(save_path / 'properties_per_sequence.nc')

# Output variable -> (statistic, number of states, state index).
# The order of the entries determines the order of the variables in `ds`.
FRET_variables = {
    'FRET_1_state': ('mean', 1, 0),
    'FRET_std_1_state': ('std', 1, 0),
    'FRET_2_states_0': ('mean', 2, 0),
    'FRET_2_states_1': ('mean', 2, 1),
    'FRET_std_2_states_0': ('std', 2, 0),
    'FRET_std_2_states_1': ('std', 2, 1),
}
for variable_name, (statistic, state_count, state_index) in FRET_variables.items():
    # drop=True removes the selected scalar coordinates, leaving one value per sequence.
    ds[variable_name] = FRET_classification[statistic].sel(
        number_of_states=state_count, state=state_index, drop=True)

# Save the extended dataset under the same file name.
ds.to_netcdf(save_path / 'properties_per_sequence.nc')

ds

The data can easily be converted to an Excel sheet.

[17]:

# Convert to a pandas table once and use it for both the Excel export and the display below.
properties_table = ds.to_pandas()
properties_table.to_excel(save_path / 'properties_per_sequence.xlsx')
properties_table

[17]:

	number_of_molecules	number_of_molecules_selected	count_1_state	count_2_states	fraction_2_states	transition_rate_0_1	transition_rate_1_0	FRET_1_state	FRET_std_1_state	FRET_2_states_0	FRET_2_states_1	FRET_std_2_states_0	FRET_std_2_states_1
sequence_subset
TTAGCCGA	669	241	41	199	0.829167	3.722390	8.402450	0.196950	0.125939	0.186073	0.425090	0.083220	0.132375
AATCGGCT	1006	284	55	217	0.797794	7.908979	2.993645	0.228415	0.181038	0.266083	0.507820	0.119068	0.096357
GGCGCCGC	753	238	43	187	0.813043	3.355469	2.901715	0.221436	0.156791	0.200279	0.471952	0.096120	0.109417