Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
137 changes: 117 additions & 20 deletions contentcuration/contentcuration/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,25 +349,80 @@ def check_feature_flag(self, flag_name):

def check_channel_space(self, channel):
tree_cte = With(self.get_user_active_trees().distinct(), name="trees")
files_cte = With(
tree_cte.join(
self.files.get_queryset(), contentnode__tree_id=tree_cte.col.tree_id
)
.values("checksum")
.distinct(),
name="files",

user_files_cte = With(
self.files.get_queryset().only(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The django queryset method only has connection with using model objects, if I understand correctly. Since this doesn't deal with model objects, values seems more appropriate. Under the hood, they may result in the same SELECT query, but I'm unsure.

"id",
"checksum",
"contentnode_id",
"file_format_id",
"file_size",
"preset_id",
),
name="user_files",
)

staging_tree_files = (
self.files.filter(contentnode__tree_id=channel.staging_tree.tree_id)
editable_files_qs = (
user_files_cte.queryset()
.with_cte(tree_cte)
.with_cte(files_cte)
.exclude(Exists(files_cte.queryset().filter(checksum=OuterRef("checksum"))))
.values("checksum")
.distinct()
.with_cte(user_files_cte)
.filter(
Exists(
tree_cte.join(
ContentNode.objects.only("id", "tree_id"),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

only or values should be unnecessary here because Django should eventually make this subquery (because it uses Exists) simply SELECT 1. Something like .all() should work

tree_id=tree_cte.col.tree_id,
)
.with_cte(tree_cte)
.filter(id=OuterRef("contentnode_id"))
)
)
)

editable_files_qs = self._filter_storage_billable_files(editable_files_qs)

existing_checksums_cte = With(
editable_files_qs.values("checksum", "file_format_id").distinct(),
name="existing_checksums",
)

staging_files_qs = (
user_files_cte.queryset()
.with_cte(user_files_cte)
.filter(
Exists(
ContentNode.objects.only("id").filter(
tree_id=channel.staging_tree.tree_id,
id=OuterRef("contentnode_id"),
)
Comment on lines +393 to +396
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Like we talked about, I think this query adjustment should bring some improvement! Secondly, similar comment about only here

)
)
)

staging_files_qs = self._filter_storage_billable_files(staging_files_qs)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of filtering both editable_files_qs and staging_files_qs, I think we could just filter the resulting queryset after we find the new files (after the checksum check)? I could envision some tradeoffs-- eliminating file formats we won't bother to count later on could reduce the size of the checksum comparison, but it means we do that twice instead of once. Thoughts?


staging_files_qs = (
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe for clarity, call this queryset something else? new_staging_files_qs or something like that, since this is post-comparison with existing checksums

staging_files_qs.with_cte(tree_cte)
.with_cte(existing_checksums_cte)
.exclude(
Exists(
existing_checksums_cte.queryset().filter(
checksum=OuterRef("checksum"),
file_format_id=OuterRef("file_format_id"),
Comment on lines +409 to +410
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's possible someone could craft two different files, with different formats, but the same checksum. Although I don't know that we need to be concerned about that, i.e. we can filter solely on checksum. We also reduce the results to the ids of distinct checksums, meaning we'd only count one of the files anyway. There was existing potential for this anyway, but I think limited usefulness for exploitation. Any particular scenario you're thinking about?

)
)
)
)

unique_staging_ids = (
staging_files_qs.order_by("checksum", "id")
.distinct("checksum")
.values("id")
)
staged_size = float(
staging_tree_files.aggregate(used=Sum("file_size"))["used"] or 0
staging_files_qs.filter(id__in=Subquery(unique_staging_ids)).aggregate(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should keep the in-subquery in mind later during unstable/hotfixes testing. I believe the query planner should make similar decisions to an EXISTS check, but maybe not.

used=Sum("file_size")
)["used"]
or 0
)

if self.get_available_space() < staged_size:
Expand Down Expand Up @@ -410,13 +465,55 @@ def get_user_active_trees(self):
)

def get_user_active_files(self):
cte = With(self.get_user_active_trees().distinct())

return (
cte.join(self.files.get_queryset(), contentnode__tree_id=cte.col.tree_id)
.with_cte(cte)
.values("checksum")
.distinct()
tree_cte = With(self.get_user_active_trees().distinct(), name="trees")

user_files_cte = With(
self.files.get_queryset().only(
"id",
"checksum",
"contentnode_id",
"file_format_id",
"file_size",
"preset_id",
),
name="user_files",
)

base_files_qs = (
user_files_cte.queryset()
.with_cte(tree_cte)
.with_cte(user_files_cte)
.filter(
Exists(
tree_cte.join(
ContentNode.objects.only("id", "tree_id"),
tree_id=tree_cte.col.tree_id,
)
.with_cte(tree_cte)
.filter(id=OuterRef("contentnode_id"))
)
)
)

base_files_qs = self._filter_storage_billable_files(base_files_qs)

unique_file_ids = (
base_files_qs.order_by("checksum", "id").distinct("checksum").values("id")
)

files_qs = base_files_qs.filter(id__in=Subquery(unique_file_ids))

return files_qs

def _filter_storage_billable_files(self, queryset):
"""
Perseus exports would not be included in storage calculations.
"""
if queryset is None:
return queryset
return queryset.exclude(file_format_id__isnull=True).exclude(
file_format_id=file_formats.PERSEUS
Copy link
Member

@rtibbles rtibbles Dec 12, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not an immediate concern, but just a heads that when QTI assessments are more broadly available, and we are generating QTI ZIP files, then we may need to filter these too (and it would need to be on the format preset, rather than the file format id, because the format id would be 'zip'!)

)

def get_space_used(self, active_files=None):
Expand Down
55 changes: 55 additions & 0 deletions contentcuration/contentcuration/tests/test_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from django.db.models import Exists
from django.db.models import OuterRef
from le_utils.constants import content_kinds
from le_utils.constants import file_formats
from mock import patch

from .base import BaseAPITestCase
Expand Down Expand Up @@ -232,3 +233,57 @@ def test_check_staged_space__exists(self):
) as get_available_staged_space:
get_available_staged_space.return_value = 0
self.assertTrue(self.user.check_staged_space(100, f.checksum))

def test_check_channel_space_ignores_perseus_exports(self):
with mock.patch("contentcuration.utils.user.calculate_user_storage"):
self.node_file.file_format_id = file_formats.PERSEUS
self.node_file.file_size = self.user.disk_space + 1
self.node_file.checksum = uuid4().hex
self.node_file.uploaded_by = self.user
self.node_file.save(set_by_file_on_disk=False)

try:
self.user.check_channel_space(self.staged_channel)
except PermissionDenied:
self.fail("Perseus exports should not count against staging space")


class UserStorageUsageTestCase(StudioTestCase):
def setUp(self):
super().setUpBase()
self.contentnode = (
self.channel.main_tree.get_descendants(include_self=True)
.filter(files__isnull=False)
.first()
)
self.assertIsNotNone(self.contentnode)
self.base_file = self.contentnode.files.first()
self.assertIsNotNone(self.base_file)

def _create_file(self, *, file_format, size):
file_record = File(
contentnode=self.contentnode,
checksum=uuid4().hex,
file_format_id=file_format,
file_size=size,
uploaded_by=self.user,
)
file_record.save(set_by_file_on_disk=False)
return file_record

def test_get_space_used_excludes_perseus_exports(self):
baseline_usage = self.user.get_space_used()

perseus_size = 125
with mock.patch("contentcuration.utils.user.calculate_user_storage"):
self._create_file(file_format=file_formats.PERSEUS, size=perseus_size)
self.assertEqual(self.user.get_space_used(), baseline_usage)

non_perseus_size = 275
with mock.patch("contentcuration.utils.user.calculate_user_storage"):
self._create_file(
file_format=self.base_file.file_format_id, size=non_perseus_size
)

expected_usage = baseline_usage + non_perseus_size
self.assertEqual(self.user.get_space_used(), expected_usage)