From 3d727a9e01a9cb549e6e60b152e44efe1546c28a Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 28 Mar 2018 13:59:22 -0400 Subject: [PATCH 001/146] media: v4l2-compat-ioctl32: don't oops on overlay commit 85ea29f19eab56ec16ec6b92bc67305998706afa upstream. At put_v4l2_window32(), it tries to access kp->clips. However, kp points to an userspace pointer. So, it should be obtained via get_user(), otherwise it can OOPS: vivid-000: ================== END STATUS ================== BUG: unable to handle kernel paging request at 00000000fffb18e0 IP: [] __put_v4l2_format32+0x169/0x220 [videodev] PGD 3f5776067 PUD 3f576f067 PMD 3f5769067 PTE 800000042548f067 Oops: 0001 [#1] SMP Modules linked in: vivid videobuf2_vmalloc videobuf2_memops v4l2_dv_timings videobuf2_core v4l2_common videodev media xt_CHECKSUM iptable_mangle ipt_MASQUERADE nf_nat_masquerade_ipv4 iptable_nat nf_nat_ipv4 nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack tun bridge stp llc ebtable_filter ebtables ip6table_filter ip6_tables bluetooth rfkill binfmt_misc snd_hda_codec_hdmi i915 snd_hda_intel snd_hda_controller snd_hda_codec intel_rapl x86_pkg_temp_thermal snd_hwdep intel_powerclamp snd_pcm coretemp snd_seq_midi kvm_intel kvm snd_seq_midi_event snd_rawmidi i2c_algo_bit drm_kms_helper snd_seq drm crct10dif_pclmul e1000e snd_seq_device crc32_pclmul snd_timer ghash_clmulni_intel snd mei_me mei ptp pps_core soundcore lpc_ich video crc32c_intel [last unloaded: media] CPU: 2 PID: 28332 Comm: v4l2-compliance Not tainted 3.18.102+ #107 Hardware name: /NUC5i7RYB, BIOS RYBDWi35.86A.0364.2017.0511.0949 05/11/2017 task: ffff8804293f8000 ti: ffff8803f5640000 task.ti: ffff8803f5640000 RIP: 0010:[] [] __put_v4l2_format32+0x169/0x220 [videodev] RSP: 0018:ffff8803f5643e28 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 00000000fffb1ab4 RDX: 00000000fffb1a68 RSI: 00000000fffb18d8 RDI: 00000000fffb1aa8 RBP: ffff8803f5643e48 R08: 0000000000000001 R09: ffff8803f54b0378 R10: 0000000000000000 R11: 0000000000000168 R12: 00000000fffb18c0 R13: 00000000fffb1a94 R14: 00000000fffb18c8 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff880456d00000(0063) knlGS:00000000f7100980 CS: 0010 DS: 002b ES: 002b CR0: 0000000080050033 CR2: 00000000fffb18e0 CR3: 00000003f552b000 CR4: 00000000003407e0 Stack: 00000000fffb1a94 00000000c0cc5640 0000000000000056 ffff8804274f3600 ffff8803f5643ed0 ffffffffc0547e16 0000000000000003 ffff8803f5643eb0 ffffffff81301460 ffff88009db44b01 ffff880441942520 ffff8800c0d05640 Call Trace: [] v4l2_compat_ioctl32+0x12d6/0x1b1d [videodev] [] ? file_has_perm+0x70/0xc0 [] compat_SyS_ioctl+0xec/0x1200 [] sysenter_dispatch+0x7/0x21 Code: 00 00 48 8b 80 48 c0 ff ff 48 83 e8 38 49 39 c6 0f 87 2b ff ff ff 49 8d 45 1c e8 a3 ce e3 c0 85 c0 0f 85 1a ff ff ff 41 8d 40 ff <4d> 8b 64 24 20 41 89 d5 48 8d 44 40 03 4d 8d 34 c4 eb 15 0f 1f RIP [] __put_v4l2_format32+0x169/0x220 [videodev] RSP CR2: 00000000fffb18e0 Tested with vivid driver on Kernel v3.18.102. Same bug happens upstream too: BUG: KASAN: user-memory-access in __put_v4l2_format32+0x98/0x4d0 [videodev] Read of size 8 at addr 00000000ffe48400 by task v4l2-compliance/8713 CPU: 0 PID: 8713 Comm: v4l2-compliance Not tainted 4.16.0-rc4+ #108 Hardware name: /NUC5i7RYB, BIOS RYBDWi35.86A.0364.2017.0511.0949 05/11/2017 Call Trace: dump_stack+0x5c/0x7c kasan_report+0x164/0x380 ? __put_v4l2_format32+0x98/0x4d0 [videodev] __put_v4l2_format32+0x98/0x4d0 [videodev] v4l2_compat_ioctl32+0x1aec/0x27a0 [videodev] ? __fsnotify_inode_delete+0x20/0x20 ? 
__put_v4l2_format32+0x4d0/0x4d0 [videodev] compat_SyS_ioctl+0x646/0x14d0 ? do_ioctl+0x30/0x30 do_fast_syscall_32+0x191/0x3f4 entry_SYSENTER_compat+0x6b/0x7a ================================================================== Disabling lock debugging due to kernel taint BUG: unable to handle kernel paging request at 00000000ffe48400 IP: __put_v4l2_format32+0x98/0x4d0 [videodev] PGD 3a22fb067 P4D 3a22fb067 PUD 39b6f0067 PMD 39b6f1067 PTE 80000003256af067 Oops: 0001 [#1] SMP KASAN Modules linked in: vivid videobuf2_vmalloc videobuf2_dma_contig videobuf2_memops v4l2_tpg v4l2_dv_timings videobuf2_v4l2 videobuf2_common v4l2_common videodev xt_CHECKSUM iptable_mangle ipt_MASQUERADE nf_nat_masquerade_ipv4 iptable_nat nf_nat_ipv4 nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack libcrc32c tun bridge stp llc ebtable_filter ebtables ip6table_filter ip6_tables bluetooth rfkill ecdh_generic binfmt_misc snd_hda_codec_hdmi intel_rapl x86_pkg_temp_thermal intel_powerclamp i915 coretemp snd_hda_intel snd_hda_codec kvm_intel snd_hwdep snd_hda_core kvm snd_pcm irqbypass crct10dif_pclmul crc32_pclmul snd_seq_midi ghash_clmulni_intel snd_seq_midi_event i2c_algo_bit intel_cstate snd_rawmidi intel_uncore snd_seq drm_kms_helper e1000e snd_seq_device snd_timer intel_rapl_perf drm ptp snd mei_me mei lpc_ich pps_core soundcore video crc32c_intel CPU: 0 PID: 8713 Comm: v4l2-compliance Tainted: G B 4.16.0-rc4+ #108 Hardware name: /NUC5i7RYB, BIOS RYBDWi35.86A.0364.2017.0511.0949 05/11/2017 RIP: 0010:__put_v4l2_format32+0x98/0x4d0 [videodev] RSP: 0018:ffff8803b9be7d30 EFLAGS: 00010282 RAX: 0000000000000000 RBX: ffff8803ac983e80 RCX: ffffffff8cd929f2 RDX: 1ffffffff1d0a149 RSI: 0000000000000297 RDI: 0000000000000297 RBP: 00000000ffe485c0 R08: fffffbfff1cf5123 R09: ffffffff8e7a8948 R10: 0000000000000001 R11: fffffbfff1cf5122 R12: 00000000ffe483e0 R13: 00000000ffe485c4 R14: ffff8803ac985918 R15: 00000000ffe483e8 FS: 0000000000000000(0000) GS:ffff880407400000(0063) knlGS:00000000f7a46980 CS: 0010 DS: 002b ES: 002b CR0: 0000000080050033 CR2: 00000000ffe48400 CR3: 00000003a83f2003 CR4: 00000000003606f0 Call Trace: v4l2_compat_ioctl32+0x1aec/0x27a0 [videodev] ? __fsnotify_inode_delete+0x20/0x20 ? __put_v4l2_format32+0x4d0/0x4d0 [videodev] compat_SyS_ioctl+0x646/0x14d0 ? 
do_ioctl+0x30/0x30 do_fast_syscall_32+0x191/0x3f4 entry_SYSENTER_compat+0x6b/0x7a Code: 4c 89 f7 4d 8d 7c 24 08 e8 e6 a4 69 cb 48 8b 83 98 1a 00 00 48 83 e8 10 49 39 c7 0f 87 9d 01 00 00 49 8d 7c 24 20 e8 c8 a4 69 cb <4d> 8b 74 24 20 4c 89 ef 4c 89 fe ba 10 00 00 00 e8 23 d9 08 cc RIP: __put_v4l2_format32+0x98/0x4d0 [videodev] RSP: ffff8803b9be7d30 CR2: 00000000ffe48400 cc: stable@vger.kernel.org Signed-off-by: Mauro Carvalho Chehab Reviewed-by: Sakari Ailus Reviewed-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/v4l2-core/v4l2-compat-ioctl32.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c index e710493c0..9aa452555 100644 --- a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c +++ b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c @@ -88,7 +88,7 @@ static int get_v4l2_window32(struct v4l2_window *kp, struct v4l2_window32 __user static int put_v4l2_window32(struct v4l2_window *kp, struct v4l2_window32 __user *up) { - struct v4l2_clip __user *kclips = kp->clips; + struct v4l2_clip __user *kclips; struct v4l2_clip32 __user *uclips; u32 n = kp->clipcount; compat_caddr_t p; @@ -103,6 +103,8 @@ static int put_v4l2_window32(struct v4l2_window *kp, struct v4l2_window32 __user if (!kp->clipcount) return 0; + if (get_user(kclips, &kp->clips)) + return -EFAULT; if (get_user(p, &up->clips)) return -EFAULT; uclips = compat_ptr(p); From f764b4b61eb4bb75a15b1d16864a84cf4ef96f2e Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sun, 25 Mar 2018 23:53:22 +0200 Subject: [PATCH 002/146] parisc: Fix out of array access in match_pci_device() commit 615b2665fd20c327b631ff1e79426775de748094 upstream. As found by the ubsan checker, the value of the 'index' variable can be out of range for the bc[] array: UBSAN: Undefined behaviour in arch/parisc/kernel/drivers.c:655:21 index 6 is out of range for type 'char [6]' Backtrace: [<104fa850>] __ubsan_handle_out_of_bounds+0x68/0x80 [<1019d83c>] check_parent+0xc0/0x170 [<1019d91c>] descend_children+0x30/0x6c [<1059e164>] device_for_each_child+0x60/0x98 [<1019cd54>] parse_tree_node+0x40/0x54 [<1019d86c>] check_parent+0xf0/0x170 [<1019d91c>] descend_children+0x30/0x6c [<1059e164>] device_for_each_child+0x60/0x98 [<1019d938>] descend_children+0x4c/0x6c [<1059e164>] device_for_each_child+0x60/0x98 [<1019cd54>] parse_tree_node+0x40/0x54 [<1019cffc>] hwpath_to_device+0xa4/0xc4 Signed-off-by: Helge Deller Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- arch/parisc/kernel/drivers.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/parisc/kernel/drivers.c b/arch/parisc/kernel/drivers.c index dba508fe1..4f7060ec6 100644 --- a/arch/parisc/kernel/drivers.c +++ b/arch/parisc/kernel/drivers.c @@ -648,6 +648,10 @@ static int match_pci_device(struct device *dev, int index, (modpath->mod == PCI_FUNC(devfn))); } + /* index might be out of bounds for bc[] */ + if (index >= 6) + return 0; + id = PCI_SLOT(pdev->devfn) | (PCI_FUNC(pdev->devfn) << 5); return (modpath->bc[index] == id); } From 45474bef0c9a442557c0d304d4cbc25cfcda1187 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 7 Mar 2018 16:02:21 +0200 Subject: [PATCH 003/146] perf intel-pt: Fix overlap detection to identify consecutive buffers correctly commit 117db4b27bf08dba412faf3924ba55fe970c57b8 upstream. Overlap detection was not not updating the buffer's 'consecutive' flag. 
Marking buffers consecutive has the advantage that decoding begins from the start of the buffer instead of the first PSB. Fix overlap detection to identify consecutive buffers correctly. Signed-off-by: Adrian Hunter Cc: Jiri Olsa Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1520431349-30689-2-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Greg Kroah-Hartman --- .../util/intel-pt-decoder/intel-pt-decoder.c | 62 +++++++++---------- .../util/intel-pt-decoder/intel-pt-decoder.h | 2 +- tools/perf/util/intel-pt.c | 5 +- 3 files changed, 34 insertions(+), 35 deletions(-) diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c index eeeae0629..66f53ef06 100644 --- a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c +++ b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c @@ -2152,14 +2152,6 @@ const struct intel_pt_state *intel_pt_decode(struct intel_pt_decoder *decoder) return &decoder->state; } -static bool intel_pt_at_psb(unsigned char *buf, size_t len) -{ - if (len < INTEL_PT_PSB_LEN) - return false; - return memmem(buf, INTEL_PT_PSB_LEN, INTEL_PT_PSB_STR, - INTEL_PT_PSB_LEN); -} - /** * intel_pt_next_psb - move buffer pointer to the start of the next PSB packet. * @buf: pointer to buffer pointer @@ -2248,6 +2240,7 @@ static unsigned char *intel_pt_last_psb(unsigned char *buf, size_t len) * @buf: buffer * @len: size of buffer * @tsc: TSC value returned + * @rem: returns remaining size when TSC is found * * Find a TSC packet in @buf and return the TSC value. This function assumes * that @buf starts at a PSB and that PSB+ will contain TSC and so stops if a @@ -2255,7 +2248,8 @@ static unsigned char *intel_pt_last_psb(unsigned char *buf, size_t len) * * Return: %true if TSC is found, false otherwise. 
*/ -static bool intel_pt_next_tsc(unsigned char *buf, size_t len, uint64_t *tsc) +static bool intel_pt_next_tsc(unsigned char *buf, size_t len, uint64_t *tsc, + size_t *rem) { struct intel_pt_pkt packet; int ret; @@ -2266,6 +2260,7 @@ static bool intel_pt_next_tsc(unsigned char *buf, size_t len, uint64_t *tsc) return false; if (packet.type == INTEL_PT_TSC) { *tsc = packet.payload; + *rem = len; return true; } if (packet.type == INTEL_PT_PSBEND) @@ -2316,6 +2311,8 @@ static int intel_pt_tsc_cmp(uint64_t tsc1, uint64_t tsc2) * @len_a: size of first buffer * @buf_b: second buffer * @len_b: size of second buffer + * @consecutive: returns true if there is data in buf_b that is consecutive + * to buf_a * * If the trace contains TSC we can look at the last TSC of @buf_a and the * first TSC of @buf_b in order to determine if the buffers overlap, and then @@ -2328,33 +2325,41 @@ static int intel_pt_tsc_cmp(uint64_t tsc1, uint64_t tsc2) static unsigned char *intel_pt_find_overlap_tsc(unsigned char *buf_a, size_t len_a, unsigned char *buf_b, - size_t len_b) + size_t len_b, bool *consecutive) { uint64_t tsc_a, tsc_b; unsigned char *p; - size_t len; + size_t len, rem_a, rem_b; p = intel_pt_last_psb(buf_a, len_a); if (!p) return buf_b; /* No PSB in buf_a => no overlap */ len = len_a - (p - buf_a); - if (!intel_pt_next_tsc(p, len, &tsc_a)) { + if (!intel_pt_next_tsc(p, len, &tsc_a, &rem_a)) { /* The last PSB+ in buf_a is incomplete, so go back one more */ len_a -= len; p = intel_pt_last_psb(buf_a, len_a); if (!p) return buf_b; /* No full PSB+ => assume no overlap */ len = len_a - (p - buf_a); - if (!intel_pt_next_tsc(p, len, &tsc_a)) + if (!intel_pt_next_tsc(p, len, &tsc_a, &rem_a)) return buf_b; /* No TSC in buf_a => assume no overlap */ } while (1) { /* Ignore PSB+ with no TSC */ - if (intel_pt_next_tsc(buf_b, len_b, &tsc_b) && - intel_pt_tsc_cmp(tsc_a, tsc_b) < 0) - return buf_b; /* tsc_a < tsc_b => no overlap */ + if (intel_pt_next_tsc(buf_b, len_b, &tsc_b, &rem_b)) { + int cmp = intel_pt_tsc_cmp(tsc_a, tsc_b); + + /* Same TSC, so buffers are consecutive */ + if (!cmp && rem_b >= rem_a) { + *consecutive = true; + return buf_b + len_b - (rem_b - rem_a); + } + if (cmp < 0) + return buf_b; /* tsc_a < tsc_b => no overlap */ + } if (!intel_pt_step_psb(&buf_b, &len_b)) return buf_b + len_b; /* No PSB in buf_b => no data */ @@ -2368,6 +2373,8 @@ static unsigned char *intel_pt_find_overlap_tsc(unsigned char *buf_a, * @buf_b: second buffer * @len_b: size of second buffer * @have_tsc: can use TSC packets to detect overlap + * @consecutive: returns true if there is data in buf_b that is consecutive + * to buf_a * * When trace samples or snapshots are recorded there is the possibility that * the data overlaps. 
Note that, for the purposes of decoding, data is only @@ -2378,7 +2385,7 @@ static unsigned char *intel_pt_find_overlap_tsc(unsigned char *buf_a, */ unsigned char *intel_pt_find_overlap(unsigned char *buf_a, size_t len_a, unsigned char *buf_b, size_t len_b, - bool have_tsc) + bool have_tsc, bool *consecutive) { unsigned char *found; @@ -2390,7 +2397,8 @@ unsigned char *intel_pt_find_overlap(unsigned char *buf_a, size_t len_a, return buf_b; /* No overlap */ if (have_tsc) { - found = intel_pt_find_overlap_tsc(buf_a, len_a, buf_b, len_b); + found = intel_pt_find_overlap_tsc(buf_a, len_a, buf_b, len_b, + consecutive); if (found) return found; } @@ -2405,28 +2413,16 @@ unsigned char *intel_pt_find_overlap(unsigned char *buf_a, size_t len_a, } /* Now len_b >= len_a */ - if (len_b > len_a) { - /* The leftover buffer 'b' must start at a PSB */ - while (!intel_pt_at_psb(buf_b + len_a, len_b - len_a)) { - if (!intel_pt_step_psb(&buf_a, &len_a)) - return buf_b; /* No overlap */ - } - } - while (1) { /* Potential overlap so check the bytes */ found = memmem(buf_a, len_a, buf_b, len_a); - if (found) + if (found) { + *consecutive = true; return buf_b + len_a; + } /* Try again at next PSB in buffer 'a' */ if (!intel_pt_step_psb(&buf_a, &len_a)) return buf_b; /* No overlap */ - - /* The leftover buffer 'b' must start at a PSB */ - while (!intel_pt_at_psb(buf_b + len_a, len_b - len_a)) { - if (!intel_pt_step_psb(&buf_a, &len_a)) - return buf_b; /* No overlap */ - } } } diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.h b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.h index 02c38fec1..89a3eda6a 100644 --- a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.h +++ b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.h @@ -102,7 +102,7 @@ const struct intel_pt_state *intel_pt_decode(struct intel_pt_decoder *decoder); unsigned char *intel_pt_find_overlap(unsigned char *buf_a, size_t len_a, unsigned char *buf_b, size_t len_b, - bool have_tsc); + bool have_tsc, bool *consecutive); int intel_pt__strerror(int code, char *buf, size_t buflen); diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index 89927b5be..02be74a43 100644 --- a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -188,14 +188,17 @@ static void intel_pt_dump_event(struct intel_pt *pt, unsigned char *buf, static int intel_pt_do_fix_overlap(struct intel_pt *pt, struct auxtrace_buffer *a, struct auxtrace_buffer *b) { + bool consecutive = false; void *start; start = intel_pt_find_overlap(a->data, a->size, b->data, b->size, - pt->have_tsc); + pt->have_tsc, &consecutive); if (!start) return -EINVAL; b->use_size = b->data + b->size - start; b->use_data = start; + if (b->use_size && consecutive) + b->consecutive = true; return 0; } From ac9a45b611487735c435caf6f022de0cf7fb93fa Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 7 Mar 2018 16:02:22 +0200 Subject: [PATCH 004/146] perf intel-pt: Fix sync_switch commit 63d8e38f6ae6c36dd5b5ba0e8c112e8861532ea2 upstream. sync_switch is a facility to synchronize decoding more closely with the point in the kernel when the context actually switched. The flag when sync_switch is enabled was global to the decoding, whereas it is really specific to the CPU. The trace data for different CPUs is put on different queues, so add sync_switch to the intel_pt_queue structure and use that in preference to the global setting in the intel_pt structure. That fixes problems decoding one CPU's trace because sync_switch was disabled on a different CPU's queue. 
Signed-off-by: Adrian Hunter Cc: Jiri Olsa Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1520431349-30689-3-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Greg Kroah-Hartman --- tools/perf/util/intel-pt.c | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index 02be74a43..3693cb26e 100644 --- a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -125,6 +125,7 @@ struct intel_pt_queue { bool stop; bool step_through_buffers; bool use_buffer_pid_tid; + bool sync_switch; pid_t pid, tid; int cpu; int switch_state; @@ -852,10 +853,12 @@ static int intel_pt_setup_queue(struct intel_pt *pt, if (pt->timeless_decoding || !pt->have_sched_switch) ptq->use_buffer_pid_tid = true; } + + ptq->sync_switch = pt->sync_switch; } if (!ptq->on_heap && - (!pt->sync_switch || + (!ptq->sync_switch || ptq->switch_state != INTEL_PT_SS_EXPECTING_SWITCH_EVENT)) { const struct intel_pt_state *state; int ret; @@ -1238,7 +1241,7 @@ static int intel_pt_sample(struct intel_pt_queue *ptq) if (pt->synth_opts.last_branch) intel_pt_update_last_branch_rb(ptq); - if (!pt->sync_switch) + if (!ptq->sync_switch) return 0; if (intel_pt_is_switch_ip(ptq, state->to_ip)) { @@ -1319,6 +1322,21 @@ static u64 intel_pt_switch_ip(struct intel_pt *pt, u64 *ptss_ip) return switch_ip; } +static void intel_pt_enable_sync_switch(struct intel_pt *pt) +{ + unsigned int i; + + pt->sync_switch = true; + + for (i = 0; i < pt->queues.nr_queues; i++) { + struct auxtrace_queue *queue = &pt->queues.queue_array[i]; + struct intel_pt_queue *ptq = queue->priv; + + if (ptq) + ptq->sync_switch = true; + } +} + static int intel_pt_run_decoder(struct intel_pt_queue *ptq, u64 *timestamp) { const struct intel_pt_state *state = ptq->state; @@ -1335,7 +1353,7 @@ static int intel_pt_run_decoder(struct intel_pt_queue *ptq, u64 *timestamp) if (pt->switch_ip) { intel_pt_log("switch_ip: %"PRIx64" ptss_ip: %"PRIx64"\n", pt->switch_ip, pt->ptss_ip); - pt->sync_switch = true; + intel_pt_enable_sync_switch(pt); } } } @@ -1351,9 +1369,9 @@ static int intel_pt_run_decoder(struct intel_pt_queue *ptq, u64 *timestamp) if (state->err) { if (state->err == INTEL_PT_ERR_NODATA) return 1; - if (pt->sync_switch && + if (ptq->sync_switch && state->from_ip >= pt->kernel_start) { - pt->sync_switch = false; + ptq->sync_switch = false; intel_pt_next_tid(pt, ptq); } if (pt->synth_opts.errors) { @@ -1379,7 +1397,7 @@ static int intel_pt_run_decoder(struct intel_pt_queue *ptq, u64 *timestamp) state->timestamp, state->est_timestamp); ptq->timestamp = state->est_timestamp; /* Use estimated TSC in unknown switch state */ - } else if (pt->sync_switch && + } else if (ptq->sync_switch && ptq->switch_state == INTEL_PT_SS_UNKNOWN && intel_pt_is_switch_ip(ptq, state->to_ip) && ptq->next_tid == -1) { @@ -1526,7 +1544,7 @@ static int intel_pt_sync_switch(struct intel_pt *pt, int cpu, pid_t tid, return 1; ptq = intel_pt_cpu_to_ptq(pt, cpu); - if (!ptq) + if (!ptq || !ptq->sync_switch) return 1; switch (ptq->switch_state) { From 1c1958dd222d4e1ddcfaf5a121aff141a28964ff Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 7 Mar 2018 16:02:23 +0200 Subject: [PATCH 005/146] perf intel-pt: Fix error recovery from missing TIP packet commit 1c196a6c771c47a2faa63d38d913e03284f73a16 upstream. When a TIP packet is expected but there is a different packet, it is an error. 
However the unexpected packet might be something important like a TSC packet, so after the error, it is necessary to continue from there, rather than the next packet. That is achieved by setting pkt_step to zero. Signed-off-by: Adrian Hunter Cc: Jiri Olsa Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1520431349-30689-4-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Greg Kroah-Hartman --- tools/perf/util/intel-pt-decoder/intel-pt-decoder.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c index 66f53ef06..21e7d3dc0 100644 --- a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c +++ b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c @@ -1492,6 +1492,7 @@ static int intel_pt_walk_fup_tip(struct intel_pt_decoder *decoder) case INTEL_PT_PSBEND: intel_pt_log("ERROR: Missing TIP after FUP\n"); decoder->pkt_state = INTEL_PT_STATE_ERR3; + decoder->pkt_step = 0; return -ENOENT; case INTEL_PT_OVF: From 3202da2058f7e129f2880cd57290428e3186a635 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 7 Mar 2018 16:02:24 +0200 Subject: [PATCH 006/146] perf intel-pt: Fix timestamp following overflow commit 91d29b288aed3406caf7c454bf2b898c96cfd177 upstream. timestamp_insn_cnt is used to estimate the timestamp based on the number of instructions since the last known timestamp. If the estimate is not accurate enough decoding might not be correctly synchronized with side-band events causing more trace errors. However there are always timestamps following an overflow, so the estimate is not needed and can indeed result in more errors. Suppress the estimate by setting timestamp_insn_cnt to zero. Signed-off-by: Adrian Hunter Cc: Jiri Olsa Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1520431349-30689-5-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Greg Kroah-Hartman --- tools/perf/util/intel-pt-decoder/intel-pt-decoder.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c index 21e7d3dc0..0b540b84f 100644 --- a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c +++ b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c @@ -1270,6 +1270,7 @@ static int intel_pt_overflow(struct intel_pt_decoder *decoder) intel_pt_clear_tx_flags(decoder); decoder->have_tma = false; decoder->cbr = 0; + decoder->timestamp_insn_cnt = 0; decoder->pkt_state = INTEL_PT_STATE_ERR_RESYNC; decoder->overflow = true; return -EOVERFLOW; From c28bf8b0d5be880d3f6043fb11a163d389429705 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 16 Feb 2018 16:26:57 +0100 Subject: [PATCH 007/146] radeon: hide pointless #warning when compile testing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit c02216acf4177c4411d33735c81cad687790fa59 upstream. In randconfig testing, we sometimes get this warning: drivers/gpu/drm/radeon/radeon_object.c: In function 'radeon_bo_create': drivers/gpu/drm/radeon/radeon_object.c:242:2: error: #warning Please enable CONFIG_MTRR and CONFIG_X86_PAT for better performance thanks to write-combining [-Werror=cpp] #warning Please enable CONFIG_MTRR and CONFIG_X86_PAT for better performance \ This is rather annoying since almost all other code produces no build-time output unless we have found a real bug. 
We already fixed this in the amdgpu driver in commit 31bb90f1cd08 ("drm/amdgpu: shut up #warning for compile testing") by adding a CONFIG_COMPILE_TEST check last year and agreed to do the same here, but both Michel and I then forgot about it until I came across the issue again now. For stable kernels, as this is one of very few remaining randconfig warnings in 4.14. Cc: stable@vger.kernel.org Link: https://patchwork.kernel.org/patch/9550009/ Signed-off-by: Arnd Bergmann Signed-off-by: Michel Dänzer Signed-off-by: Alex Deucher Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/radeon/radeon_object.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/radeon/radeon_object.c b/drivers/gpu/drm/radeon/radeon_object.c index fb6ad1438..83aee9e81 100644 --- a/drivers/gpu/drm/radeon/radeon_object.c +++ b/drivers/gpu/drm/radeon/radeon_object.c @@ -238,9 +238,10 @@ int radeon_bo_create(struct radeon_device *rdev, * may be slow * See https://bugs.freedesktop.org/show_bug.cgi?id=88758 */ - +#ifndef CONFIG_COMPILE_TEST #warning Please enable CONFIG_MTRR and CONFIG_X86_PAT for better performance \ thanks to write-combining +#endif if (bo->flags & RADEON_GEM_GTT_WC) DRM_INFO_ONCE("Please enable CONFIG_MTRR and CONFIG_X86_PAT for " From 26dae08564982ed741360b0bb9f70744df801050 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 17 Apr 2018 14:56:21 +0200 Subject: [PATCH 008/146] Revert "perf tests: Decompress kernel module before objdump" This reverts commit b0761b57e0bf11ada4c45e68f4cba1370363d90d which is commit 94df1040b1e6aacd8dec0ba3c61d7e77cd695f26 upstream. It breaks the build of perf on 4.4.y, so I'm dropping it. Reported-by: Pavlos Parissis Reported-by: Lei Chen Reported-by: Maxime Hadjinlian Cc: Namhyung Kim Cc: Adrian Hunter Cc: Jiri Olsa Cc: David Ahern Cc: Peter Zijlstra Cc: Wang Nan Cc: kernel-team@lge.com Cc: Arnaldo Carvalho de Melo Cc: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- tools/perf/tests/code-reading.c | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c index 6ea4fcfaa..a767a6400 100644 --- a/tools/perf/tests/code-reading.c +++ b/tools/perf/tests/code-reading.c @@ -182,8 +182,6 @@ static int read_object_code(u64 addr, size_t len, u8 cpumode, unsigned char buf2[BUFSZ]; size_t ret_len; u64 objdump_addr; - const char *objdump_name; - char decomp_name[KMOD_DECOMP_LEN]; int ret; pr_debug("Reading object code for memory address: %#"PRIx64"\n", addr); @@ -244,25 +242,9 @@ static int read_object_code(u64 addr, size_t len, u8 cpumode, state->done[state->done_cnt++] = al.map->start; } - objdump_name = al.map->dso->long_name; - if (dso__needs_decompress(al.map->dso)) { - if (dso__decompress_kmodule_path(al.map->dso, objdump_name, - decomp_name, - sizeof(decomp_name)) < 0) { - pr_debug("decompression failed\n"); - return -1; - } - - objdump_name = decomp_name; - } - /* Read the object code using objdump */ objdump_addr = map__rip_2objdump(al.map, al.addr); - ret = read_via_objdump(objdump_name, objdump_addr, buf2, len); - - if (dso__needs_decompress(al.map->dso)) - unlink(objdump_name); - + ret = read_via_objdump(al.map->dso->long_name, objdump_addr, buf2, len); if (ret > 0) { /* * The kernel maps are inaccurate - assume objdump is right in From 1dbb6de57ef7048396b1de391e050a11bd6b0e04 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Fri, 6 Apr 2018 10:03:17 +0900 Subject: [PATCH 009/146] block/loop: fix deadlock after loop_set_status commit 
1e047eaab3bb5564f25b41e9cd3a053009f4e789 upstream. syzbot is reporting deadlocks at __blkdev_get() [1]. ---------------------------------------- [ 92.493919] systemd-udevd D12696 525 1 0x00000000 [ 92.495891] Call Trace: [ 92.501560] schedule+0x23/0x80 [ 92.502923] schedule_preempt_disabled+0x5/0x10 [ 92.504645] __mutex_lock+0x416/0x9e0 [ 92.510760] __blkdev_get+0x73/0x4f0 [ 92.512220] blkdev_get+0x12e/0x390 [ 92.518151] do_dentry_open+0x1c3/0x2f0 [ 92.519815] path_openat+0x5d9/0xdc0 [ 92.521437] do_filp_open+0x7d/0xf0 [ 92.527365] do_sys_open+0x1b8/0x250 [ 92.528831] do_syscall_64+0x6e/0x270 [ 92.530341] entry_SYSCALL_64_after_hwframe+0x42/0xb7 [ 92.931922] 1 lock held by systemd-udevd/525: [ 92.933642] #0: 00000000a2849e25 (&bdev->bd_mutex){+.+.}, at: __blkdev_get+0x73/0x4f0 ---------------------------------------- The reason of deadlock turned out that wait_event_interruptible() in blk_queue_enter() got stuck with bdev->bd_mutex held at __blkdev_put() due to q->mq_freeze_depth == 1. ---------------------------------------- [ 92.787172] a.out S12584 634 633 0x80000002 [ 92.789120] Call Trace: [ 92.796693] schedule+0x23/0x80 [ 92.797994] blk_queue_enter+0x3cb/0x540 [ 92.803272] generic_make_request+0xf0/0x3d0 [ 92.807970] submit_bio+0x67/0x130 [ 92.810928] submit_bh_wbc+0x15e/0x190 [ 92.812461] __block_write_full_page+0x218/0x460 [ 92.815792] __writepage+0x11/0x50 [ 92.817209] write_cache_pages+0x1ae/0x3d0 [ 92.825585] generic_writepages+0x5a/0x90 [ 92.831865] do_writepages+0x43/0xd0 [ 92.836972] __filemap_fdatawrite_range+0xc1/0x100 [ 92.838788] filemap_write_and_wait+0x24/0x70 [ 92.840491] __blkdev_put+0x69/0x1e0 [ 92.841949] blkdev_close+0x16/0x20 [ 92.843418] __fput+0xda/0x1f0 [ 92.844740] task_work_run+0x87/0xb0 [ 92.846215] do_exit+0x2f5/0xba0 [ 92.850528] do_group_exit+0x34/0xb0 [ 92.852018] SyS_exit_group+0xb/0x10 [ 92.853449] do_syscall_64+0x6e/0x270 [ 92.854944] entry_SYSCALL_64_after_hwframe+0x42/0xb7 [ 92.943530] 1 lock held by a.out/634: [ 92.945105] #0: 00000000a2849e25 (&bdev->bd_mutex){+.+.}, at: __blkdev_put+0x3c/0x1e0 ---------------------------------------- The reason of q->mq_freeze_depth == 1 turned out that loop_set_status() forgot to call blk_mq_unfreeze_queue() at error paths for info->lo_encrypt_type != NULL case. ---------------------------------------- [ 37.509497] CPU: 2 PID: 634 Comm: a.out Tainted: G W 4.16.0+ #457 [ 37.513608] Hardware name: VMware, Inc. 
VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 05/19/2017 [ 37.518832] RIP: 0010:blk_freeze_queue_start+0x17/0x40 [ 37.521778] RSP: 0018:ffffb0c2013e7c60 EFLAGS: 00010246 [ 37.524078] RAX: 0000000000000000 RBX: ffff8b07b1519798 RCX: 0000000000000000 [ 37.527015] RDX: 0000000000000002 RSI: ffffb0c2013e7cc0 RDI: ffff8b07b1519798 [ 37.529934] RBP: ffffb0c2013e7cc0 R08: 0000000000000008 R09: 47a189966239b898 [ 37.532684] R10: dad78b99b278552f R11: 9332dca72259d5ef R12: ffff8b07acd73678 [ 37.535452] R13: 0000000000004c04 R14: 0000000000000000 R15: ffff8b07b841e940 [ 37.538186] FS: 00007fede33b9740(0000) GS:ffff8b07b8e80000(0000) knlGS:0000000000000000 [ 37.541168] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 37.543590] CR2: 00000000206fdf18 CR3: 0000000130b30006 CR4: 00000000000606e0 [ 37.546410] Call Trace: [ 37.547902] blk_freeze_queue+0x9/0x30 [ 37.549968] loop_set_status+0x67/0x3c0 [loop] [ 37.549975] loop_set_status64+0x3b/0x70 [loop] [ 37.549986] lo_ioctl+0x223/0x810 [loop] [ 37.549995] blkdev_ioctl+0x572/0x980 [ 37.550003] block_ioctl+0x34/0x40 [ 37.550006] do_vfs_ioctl+0xa7/0x6d0 [ 37.550017] ksys_ioctl+0x6b/0x80 [ 37.573076] SyS_ioctl+0x5/0x10 [ 37.574831] do_syscall_64+0x6e/0x270 [ 37.576769] entry_SYSCALL_64_after_hwframe+0x42/0xb7 ---------------------------------------- [1] https://syzkaller.appspot.com/bug?id=cd662bc3f6022c0979d01a262c318fab2ee9b56f Signed-off-by: Tetsuo Handa Reported-by: syzbot Fixes: ecdd09597a572513 ("block/loop: fix race between I/O and set_status") Cc: Ming Lei Cc: Dmitry Vyukov Cc: stable Cc: Jens Axboe Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- drivers/block/loop.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 58c649dd3..2acb4b5fb 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1128,11 +1128,15 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) if (info->lo_encrypt_type) { unsigned int type = info->lo_encrypt_type; - if (type >= MAX_LO_CRYPT) - return -EINVAL; + if (type >= MAX_LO_CRYPT) { + err = -EINVAL; + goto exit; + } xfer = xfer_funcs[type]; - if (xfer == NULL) - return -EINVAL; + if (xfer == NULL) { + err = -EINVAL; + goto exit; + } } else xfer = NULL; From 30bd1bc08ad136753d1131d7e5994d3387c59bef Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Mon, 5 Mar 2018 09:39:38 +0100 Subject: [PATCH 010/146] s390/qdio: don't retry EQBS after CCQ 96 commit dae55b6fef58530c13df074bcc182c096609339e upstream. Immediate retry of EQBS after CCQ 96 means that we potentially misreport the state of buffers inspected during the first EQBS call. This occurs when 1. the first EQBS finds all inspected buffers still in the initial state set by the driver (ie INPUT EMPTY or OUTPUT PRIMED), 2. the EQBS terminates early with CCQ 96, and 3. by the time that the second EQBS comes around, the state of those previously inspected buffers has changed. If the state reported by the second EQBS is 'driver-owned', all we know is that the previous buffers are driver-owned now as well. But we can't tell if they all have the same state. So for instance - the second EQBS reports OUTPUT EMPTY, but any number of the previous buffers could be OUTPUT ERROR by now, - the second EQBS reports OUTPUT ERROR, but any number of the previous buffers could be OUTPUT EMPTY by now. Effectively, this can result in both over- and underreporting of errors. 
If the state reported by the second EQBS is 'HW-owned', that doesn't guarantee that the previous buffers have not been switched to driver-owned in the mean time. So for instance - the second EQBS reports INPUT EMPTY, but any number of the previous buffers could be INPUT PRIMED (or INPUT ERROR) by now. This would result in failure to process pending work on the queue. If it's the final check before yielding initiative, this can cause a (temporary) queue stall due to IRQ avoidance. Fixes: 25f269f17316 ("[S390] qdio: EQBS retry after CCQ 96") Cc: #v3.2+ Signed-off-by: Julian Wiedmann Reviewed-by: Benjamin Block Signed-off-by: Martin Schwidefsky Signed-off-by: Greg Kroah-Hartman --- drivers/s390/cio/qdio_main.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/drivers/s390/cio/qdio_main.c b/drivers/s390/cio/qdio_main.c index 4bb5262f7..da28b1724 100644 --- a/drivers/s390/cio/qdio_main.c +++ b/drivers/s390/cio/qdio_main.c @@ -126,7 +126,7 @@ static inline int qdio_check_ccq(struct qdio_q *q, unsigned int ccq) static int qdio_do_eqbs(struct qdio_q *q, unsigned char *state, int start, int count, int auto_ack) { - int rc, tmp_count = count, tmp_start = start, nr = q->nr, retried = 0; + int rc, tmp_count = count, tmp_start = start, nr = q->nr; unsigned int ccq = 0; qperf_inc(q, eqbs); @@ -149,14 +149,7 @@ static int qdio_do_eqbs(struct qdio_q *q, unsigned char *state, qperf_inc(q, eqbs_partial); DBF_DEV_EVENT(DBF_WARN, q->irq_ptr, "EQBS part:%02x", tmp_count); - /* - * Retry once, if that fails bail out and process the - * extracted buffers before trying again. - */ - if (!retried++) - goto again; - else - return count - tmp_count; + return count - tmp_count; } DBF_ERROR("%4x EQBS ERROR", SCH_NO(q)); From ca8786d0b51d836d229aeb087bbd73fe5ef615ac Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Wed, 7 Mar 2018 14:01:01 +0100 Subject: [PATCH 011/146] s390/qdio: don't merge ERROR output buffers commit 0cf1e05157b9e5530dcc3ca9fec9bf617fc93375 upstream. On an Output queue, both EMPTY and PENDING buffer states imply that the buffer is ready for completion-processing by the upper-layer drivers. So for a non-QEBSM Output queue, get_buf_states() merges mixed batches of PENDING and EMPTY buffers into one large batch of EMPTY buffers. The upper-layer driver (ie. qeth) later distuingishes PENDING from EMPTY by inspecting the slsb_state for QDIO_OUTBUF_STATE_FLAG_PENDING. But the merge logic in get_buf_states() contains a bug that causes us to erronously also merge ERROR buffers into such a batch of EMPTY buffers (ERROR is 0xaf, EMPTY is 0xa1; so ERROR & EMPTY == EMPTY). Effectively, most outbound ERROR buffers are currently discarded silently and processed as if they had succeeded. Note that this affects _all_ non-QEBSM device types, not just IQD with CQ. Fix it by explicitly spelling out the exact conditions for merging. For extracting the "get initial state" part out of the loop, this relies on the fact that get_buf_states() is never called with a count of 0. The QEBSM path already strictly requires this, and the two callers with variable 'count' make sure of it. 
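As an illustrative aside (not part of the upstream patch): the failure mode of the removed merge check can be reproduced with the state values quoted above. The constants below mirror the values given in the description (EMPTY 0xa1, ERROR 0xaf), and the tiny standalone program is only a sketch, not driver code:

  #include <stdio.h>

  #define SLSB_P_OUTPUT_EMPTY 0xa1   /* value quoted in the description above */
  #define SLSB_P_OUTPUT_ERROR 0xaf   /* value quoted in the description above */

  int main(void)
  {
          unsigned char first_state = SLSB_P_OUTPUT_EMPTY; /* "__state" of the batch */
          unsigned char next        = SLSB_P_OUTPUT_ERROR; /* a later buffer in the scan */

          /* old merge check: keep scanning while (val & __state) == __state */
          if ((next & first_state) == first_state)
                  printf("ERROR buffer merged into EMPTY batch: 0x%02x & 0x%02x == 0x%02x\n",
                         next, first_state, next & first_state);
          return 0;
  }

Because 0xaf & 0xa1 == 0xa1, the old loop never breaks out when an ERROR buffer follows EMPTY ones, so the whole batch is reported as EMPTY - exactly the silent discard described above.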
Fixes: 104ea556ee7f ("qdio: support asynchronous delivery of storage blocks") Cc: #v3.2+ Signed-off-by: Julian Wiedmann Reviewed-by: Ursula Braun Reviewed-by: Benjamin Block Signed-off-by: Martin Schwidefsky Signed-off-by: Greg Kroah-Hartman --- drivers/s390/cio/qdio_main.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/drivers/s390/cio/qdio_main.c b/drivers/s390/cio/qdio_main.c index da28b1724..742ca57ec 100644 --- a/drivers/s390/cio/qdio_main.c +++ b/drivers/s390/cio/qdio_main.c @@ -205,7 +205,10 @@ static int qdio_do_sqbs(struct qdio_q *q, unsigned char state, int start, return 0; } -/* returns number of examined buffers and their common state in *state */ +/* + * Returns number of examined buffers and their common state in *state. + * Requested number of buffers-to-examine must be > 0. + */ static inline int get_buf_states(struct qdio_q *q, unsigned int bufnr, unsigned char *state, unsigned int count, int auto_ack, int merge_pending) @@ -216,17 +219,23 @@ static inline int get_buf_states(struct qdio_q *q, unsigned int bufnr, if (is_qebsm(q)) return qdio_do_eqbs(q, state, bufnr, count, auto_ack); - for (i = 0; i < count; i++) { - if (!__state) { - __state = q->slsb.val[bufnr]; - if (merge_pending && __state == SLSB_P_OUTPUT_PENDING) - __state = SLSB_P_OUTPUT_EMPTY; - } else if (merge_pending) { - if ((q->slsb.val[bufnr] & __state) != __state) - break; - } else if (q->slsb.val[bufnr] != __state) - break; + /* get initial state: */ + __state = q->slsb.val[bufnr]; + if (merge_pending && __state == SLSB_P_OUTPUT_PENDING) + __state = SLSB_P_OUTPUT_EMPTY; + + for (i = 1; i < count; i++) { bufnr = next_buf(bufnr); + + /* merge PENDING into EMPTY: */ + if (merge_pending && + q->slsb.val[bufnr] == SLSB_P_OUTPUT_PENDING && + __state == SLSB_P_OUTPUT_EMPTY) + continue; + + /* stop if next state differs from initial state: */ + if (q->slsb.val[bufnr] != __state) + break; } *state = __state; return i; From f00138efb427cd2a9aa73aa16027e335b00fb86b Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Tue, 3 Apr 2018 16:02:15 +0200 Subject: [PATCH 012/146] s390/ipl: ensure loadparm valid flag is set commit 15deb080a6087b73089139569558965750e69d67 upstream. When loadparm is set in reipl parm block, the kernel should also set DIAG308_FLAGS_LP_VALID flag. This fixes loadparm ignoring during z/VM fcp -> ccw reipl and kvm direct boot -> ccw reipl. Cc: Reviewed-by: Heiko Carstens Signed-off-by: Vasily Gorbik Signed-off-by: Martin Schwidefsky Signed-off-by: Greg Kroah-Hartman --- arch/s390/kernel/ipl.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index 42570d8fb..e73979236 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -798,6 +798,7 @@ static ssize_t reipl_generic_loadparm_store(struct ipl_parameter_block *ipb, /* copy and convert to ebcdic */ memcpy(ipb->hdr.loadparm, buf, lp_len); ASCEBC(ipb->hdr.loadparm, LOADPARM_LEN); + ipb->hdr.flags |= DIAG308_FLAGS_LP_VALID; return len; } From cd7a7a36236472372a90b828e26564d46a1dd68f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 8 Apr 2018 11:57:10 -0400 Subject: [PATCH 013/146] getname_kernel() needs to make sure that ->name != ->iname in long case commit 30ce4d1903e1d8a7ccd110860a5eef3c638ed8be upstream. missed it in "kill struct filename.separate" several years ago. 
Cc: stable@vger.kernel.org Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- fs/namei.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/namei.c b/fs/namei.c index 0fcad42e4..de57dd59d 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -219,9 +219,10 @@ getname_kernel(const char * filename) if (len <= EMBEDDED_NAME_MAX) { result->name = (char *)result->iname; } else if (len <= PATH_MAX) { + const size_t size = offsetof(struct filename, iname[1]); struct filename *tmp; - tmp = kmalloc(sizeof(*tmp), GFP_KERNEL); + tmp = kmalloc(size, GFP_KERNEL); if (unlikely(!tmp)) { __putname(result); return ERR_PTR(-ENOMEM); From 46ecb91a2105e565b04e92ac5fd0b3580054f91b Mon Sep 17 00:00:00 2001 From: Sudhir Sreedharan Date: Thu, 15 Feb 2018 12:52:45 +0530 Subject: [PATCH 014/146] rtl8187: Fix NULL pointer dereference in priv->conf_mutex commit 7972326a26b5bf8dc2adac575c4e03ee7e9d193a upstream. This can be reproduced by bind/unbind the driver multiple times in AM3517 board. Analysis revealed that rtl8187_start() was invoked before probe finishes(ie. before the mutex is initialized). INFO: trying to register non-static key. the code is fine but needs lockdep annotation. turning off the locking correctness validator. CPU: 0 PID: 821 Comm: wpa_supplicant Not tainted 4.9.80-dirty #250 Hardware name: Generic AM3517 (Flattened Device Tree) [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [] (show_stack) from [] (register_lock_class+0x4f4/0x55c) [] (register_lock_class) from [] (__lock_acquire+0x74/0x1938) [] (__lock_acquire) from [] (lock_acquire+0xfc/0x23c) [] (lock_acquire) from [] (mutex_lock_nested+0x50/0x3b0) [] (mutex_lock_nested) from [] (rtl8187_start+0x2c/0xd54) [] (rtl8187_start) from [] (drv_start+0xa8/0x320) [] (drv_start) from [] (ieee80211_do_open+0x2bc/0x8e4) [] (ieee80211_do_open) from [] (__dev_open+0xb8/0x120) [] (__dev_open) from [] (__dev_change_flags+0x88/0x14c) [] (__dev_change_flags) from [] (dev_change_flags+0x18/0x48) [] (dev_change_flags) from [] (devinet_ioctl+0x738/0x840) [] (devinet_ioctl) from [] (sock_ioctl+0x164/0x2f4) [] (sock_ioctl) from [] (do_vfs_ioctl+0x8c/0x9d0) [] (do_vfs_ioctl) from [] (SyS_ioctl+0x6c/0x7c) [] (SyS_ioctl) from [] (ret_fast_syscall+0x0/0x1c) Unable to handle kernel NULL pointer dereference at virtual address 00000000 pgd = cd1ec000 [00000000] *pgd=8d1de831, *pte=00000000, *ppte=00000000 Internal error: Oops: 817 [#1] PREEMPT ARM Modules linked in: CPU: 0 PID: 821 Comm: wpa_supplicant Not tainted 4.9.80-dirty #250 Hardware name: Generic AM3517 (Flattened Device Tree) task: ce73eec0 task.stack: cd1ea000 PC is at mutex_lock_nested+0xe8/0x3b0 LR is at mutex_lock_nested+0xd0/0x3b0 Cc: stable@vger.kernel.org Signed-off-by: Sudhir Sreedharan Signed-off-by: Kalle Valo Signed-off-by: Greg Kroah-Hartman --- drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c b/drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c index b7f72f9c7..b3691712d 100644 --- a/drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c +++ b/drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c @@ -1454,6 +1454,7 @@ static int rtl8187_probe(struct usb_interface *intf, goto err_free_dev; } mutex_init(&priv->io_mutex); + mutex_init(&priv->conf_mutex); SET_IEEE80211_DEV(dev, &intf->dev); usb_set_intfdata(intf, dev); @@ -1627,7 +1628,6 @@ static int rtl8187_probe(struct usb_interface *intf, printk(KERN_ERR "rtl8187: Cannot register device\n"); goto err_free_dmabuf; } 
- mutex_init(&priv->conf_mutex); skb_queue_head_init(&priv->b_tx_status.queue); wiphy_info(dev->wiphy, "hwaddr %pM, %s V%d + %s, rfkill mask %d\n", From 5cb59b755873a453d5c0d013e67eadffa4551b85 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Mon, 15 Jan 2018 14:58:21 +0100 Subject: [PATCH 015/146] hwmon: (ina2xx) Fix access to uninitialized mutex commit 0c4c5860e9983eb3da7a3d73ca987643c3ed034b upstream. Initialize data->config_lock mutex before it is used by the driver code. This fixes following warning on Odroid XU3 boards: INFO: trying to register non-static key. the code is fine but needs lockdep annotation. turning off the locking correctness validator. CPU: 5 PID: 1 Comm: swapper/0 Not tainted 4.15.0-rc7-next-20180115-00001-gb75575dee3f2 #107 Hardware name: SAMSUNG EXYNOS (Flattened Device Tree) [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [] (show_stack) from [] (dump_stack+0x90/0xc8) [] (dump_stack) from [] (register_lock_class+0x1c0/0x59c) [] (register_lock_class) from [] (__lock_acquire+0x78/0x1850) [] (__lock_acquire) from [] (lock_acquire+0xc8/0x2b8) [] (lock_acquire) from [] (__mutex_lock+0x60/0xa0c) [] (__mutex_lock) from [] (mutex_lock_nested+0x1c/0x24) [] (mutex_lock_nested) from [] (ina2xx_set_shunt+0x70/0xb0) [] (ina2xx_set_shunt) from [] (ina2xx_probe+0x88/0x1b0) [] (ina2xx_probe) from [] (i2c_device_probe+0x1e0/0x2d0) [] (i2c_device_probe) from [] (driver_probe_device+0x2b8/0x4a0) [] (driver_probe_device) from [] (__driver_attach+0xfc/0x120) [] (__driver_attach) from [] (bus_for_each_dev+0x58/0x7c) [] (bus_for_each_dev) from [] (bus_add_driver+0x174/0x250) [] (bus_add_driver) from [] (driver_register+0x78/0xf4) [] (driver_register) from [] (i2c_register_driver+0x38/0xa8) [] (i2c_register_driver) from [] (do_one_initcall+0x48/0x18c) [] (do_one_initcall) from [] (kernel_init_freeable+0x110/0x1d4) [] (kernel_init_freeable) from [] (kernel_init+0x8/0x114) [] (kernel_init) from [] (ret_from_fork+0x14/0x20) Fixes: 5d389b125186 ("hwmon: (ina2xx) Make calibration register value fixed") Signed-off-by: Marek Szyprowski Signed-off-by: Guenter Roeck [backport to v4.4.y/v4.9.y: context changes] Signed-off-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- drivers/hwmon/ina2xx.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/hwmon/ina2xx.c b/drivers/hwmon/ina2xx.c index a629f7c13..ac63e5620 100644 --- a/drivers/hwmon/ina2xx.c +++ b/drivers/hwmon/ina2xx.c @@ -447,6 +447,7 @@ static int ina2xx_probe(struct i2c_client *client, /* set the device type */ data->config = &ina2xx_config[id->driver_data]; + mutex_init(&data->config_lock); if (of_property_read_u32(dev->of_node, "shunt-resistor", &val) < 0) { struct ina2xx_platform_data *pdata = dev_get_platdata(dev); @@ -473,8 +474,6 @@ static int ina2xx_probe(struct i2c_client *client, return -ENODEV; } - mutex_init(&data->config_lock); - data->groups[group++] = &ina2xx_group; if (id->driver_data == ina226) data->groups[group++] = &ina226_group; From 596aeb08c610f43d601cf1934357f6527388ba7d Mon Sep 17 00:00:00 2001 From: Bassem Boubaker Date: Wed, 11 Apr 2018 13:15:53 +0200 Subject: [PATCH 016/146] cdc_ether: flag the Cinterion AHS8 modem by gemalto as WWAN [ Upstream commit 53765341ee821c0a0f1dec41adc89c9096ad694c ] The Cinterion AHS8 is a 3G device with one embedded WWAN interface using cdc_ether as a driver. The modem is controlled via AT commands through the exposed TTYs. 
AT+CGDCONT write command can be used to activate or deactivate a WWAN connection for a PDP context defined with the same command. UE supports one WWAN adapter. Signed-off-by: Bassem Boubaker Acked-by: Oliver Neukum Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/usb/cdc_ether.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/usb/cdc_ether.c b/drivers/net/usb/cdc_ether.c index f9343bee1..6578127db 100644 --- a/drivers/net/usb/cdc_ether.c +++ b/drivers/net/usb/cdc_ether.c @@ -704,6 +704,12 @@ static const struct usb_device_id products[] = { USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&wwan_info, +}, { + /* Cinterion AHS3 modem by GEMALTO */ + USB_DEVICE_AND_INTERFACE_INFO(0x1e2d, 0x0055, USB_CLASS_COMM, + USB_CDC_SUBCLASS_ETHERNET, + USB_CDC_PROTO_NONE), + .driver_info = (unsigned long)&wwan_info, }, { /* Telit modules */ USB_VENDOR_AND_INTERFACE_INFO(0x1bc7, USB_CLASS_COMM, From 495e6f892c17447204960288a2e2d1e66fe1f12f Mon Sep 17 00:00:00 2001 From: Tejaswi Tanikella Date: Wed, 11 Apr 2018 16:34:47 +0530 Subject: [PATCH 017/146] slip: Check if rstate is initialized before uncompressing [ Upstream commit 3f01ddb962dc506916c243f9524e8bef97119b77 ] On receiving a packet the state index points to the rstate which must be used to fill up IP and TCP headers. But if the state index points to a rstate which is unitialized, i.e. filled with zeros, it gets stuck in an infinite loop inside ip_fast_csum trying to compute the ip checsum of a header with zero length. 89.666953: <2> [] slhc_uncompress+0x464/0x468 89.666965: <2> [] ppp_receive_nonmp_frame+0x3b4/0x65c 89.666978: <2> [] ppp_receive_frame+0x64/0x7e0 89.666991: <2> [] ppp_input+0x104/0x198 89.667005: <2> [] pppopns_recv_core+0x238/0x370 89.667027: <2> [] __sk_receive_skb+0xdc/0x250 89.667040: <2> [] pppopns_recv+0x44/0x60 89.667053: <2> [] __sock_queue_rcv_skb+0x16c/0x24c 89.667065: <2> [] sock_queue_rcv_skb+0x2c/0x38 89.667085: <2> [] raw_rcv+0x124/0x154 89.667098: <2> [] raw_local_deliver+0x1e0/0x22c 89.667117: <2> [] ip_local_deliver_finish+0x70/0x24c 89.667131: <2> [] ip_local_deliver+0x100/0x10c ./scripts/faddr2line vmlinux slhc_uncompress+0x464/0x468 output: ip_fast_csum at arch/arm64/include/asm/checksum.h:40 (inlined by) slhc_uncompress at drivers/net/slip/slhc.c:615 Adding a variable to indicate if the current rstate is initialized. If such a packet arrives, move to toss state. Signed-off-by: Tejaswi Tanikella Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/slip/slhc.c | 5 +++++ include/net/slhc_vj.h | 1 + 2 files changed, 6 insertions(+) diff --git a/drivers/net/slip/slhc.c b/drivers/net/slip/slhc.c index 27ed25252..cfd81eb1b 100644 --- a/drivers/net/slip/slhc.c +++ b/drivers/net/slip/slhc.c @@ -509,6 +509,10 @@ slhc_uncompress(struct slcompress *comp, unsigned char *icp, int isize) if(x < 0 || x > comp->rslot_limit) goto bad; + /* Check if the cstate is initialized */ + if (!comp->rstate[x].initialized) + goto bad; + comp->flags &=~ SLF_TOSS; comp->recv_current = x; } else { @@ -673,6 +677,7 @@ slhc_remember(struct slcompress *comp, unsigned char *icp, int isize) if (cs->cs_tcp.doff > 5) memcpy(cs->cs_tcpopt, icp + ihl*4 + sizeof(struct tcphdr), (cs->cs_tcp.doff - 5) * 4); cs->cs_hsize = ihl*2 + cs->cs_tcp.doff*2; + cs->initialized = true; /* Put headers back on packet * Neither header checksum is recalculated */ diff --git a/include/net/slhc_vj.h b/include/net/slhc_vj.h index 8716d5942..8fcf8908a 100644 --- a/include/net/slhc_vj.h +++ b/include/net/slhc_vj.h @@ -127,6 +127,7 @@ typedef __u32 int32; */ struct cstate { byte_t cs_this; /* connection id number (xmit) */ + bool initialized; /* true if initialized */ struct cstate *next; /* next in ring (xmit) */ struct iphdr cs_ip; /* ip/tcp hdr from most recent packet */ struct tcphdr cs_tcp; From e9aa0570b05a4e802bc8c81ecd547800d980f03a Mon Sep 17 00:00:00 2001 From: Phil Elwell Date: Wed, 11 Apr 2018 10:59:17 +0100 Subject: [PATCH 018/146] lan78xx: Correctly indicate invalid OTP [ Upstream commit 4bfc33807a9a02764bdd1e42e794b3b401240f27 ] lan78xx_read_otp tries to return -EINVAL in the event of invalid OTP content, but the value gets overwritten before it is returned and the read goes ahead anyway. Make the read conditional as it should be and preserve the error code. Fixes: 55d7de9de6c3 ("Microchip's LAN7800 family USB 2/3 to 10/100/1000 Ethernet device driver") Signed-off-by: Phil Elwell Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/usb/lan78xx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index ebdee8f01..a6d429950 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -618,7 +618,8 @@ static int lan78xx_read_otp(struct lan78xx_net *dev, u32 offset, offset += 0x100; else ret = -EINVAL; - ret = lan78xx_read_raw_otp(dev, offset, length, data); + if (!ret) + ret = lan78xx_read_raw_otp(dev, offset, length, data); } return ret; From ae666c1fc6c2f745dda7d785389e16ce0021ef5b Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 30 May 2016 12:56:27 +0200 Subject: [PATCH 019/146] x86/hweight: Get rid of the special calling convention commit f5967101e9de12addcda4510dfbac66d7c5779c3 upstream. People complained about ARCH_HWEIGHT_CFLAGS and how it throws a wrench into kcov, lto, etc, experimentations. Add asm versions for __sw_hweight{32,64}() and do explicit saving and restoring of clobbered registers. This gets rid of the special calling convention. We get to call those functions on !X86_FEATURE_POPCNT CPUs. We still need to hardcode POPCNT and register operands as some old gas versions which we support, do not know about POPCNT. Btw, remove redundant REX prefix from 32-bit POPCNT because alternatives can do padding now. Suggested-by: H. 
Peter Anvin Signed-off-by: Borislav Petkov Acked-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1464605787-20603-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar Signed-off-by: Matthias Kaehlcke Signed-off-by: Greg Kroah-Hartman --- arch/x86/Kconfig | 5 -- arch/x86/include/asm/arch_hweight.h | 24 ++++----- arch/x86/kernel/i386_ksyms_32.c | 2 + arch/x86/kernel/x8664_ksyms_64.c | 3 ++ arch/x86/lib/Makefile | 2 +- arch/x86/lib/hweight.S | 77 +++++++++++++++++++++++++++++ lib/Makefile | 2 - lib/hweight.c | 4 ++ 8 files changed, 97 insertions(+), 22 deletions(-) create mode 100644 arch/x86/lib/hweight.S diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 388dfd3bb..cf5be43fc 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -300,11 +300,6 @@ config X86_32_LAZY_GS def_bool y depends on X86_32 && !CC_STACKPROTECTOR -config ARCH_HWEIGHT_CFLAGS - string - default "-fcall-saved-ecx -fcall-saved-edx" if X86_32 - default "-fcall-saved-rdi -fcall-saved-rsi -fcall-saved-rdx -fcall-saved-rcx -fcall-saved-r8 -fcall-saved-r9 -fcall-saved-r10 -fcall-saved-r11" if X86_64 - config ARCH_SUPPORTS_UPROBES def_bool y diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h index 259a7c1ef..44f825c80 100644 --- a/arch/x86/include/asm/arch_hweight.h +++ b/arch/x86/include/asm/arch_hweight.h @@ -2,8 +2,8 @@ #define _ASM_X86_HWEIGHT_H #ifdef CONFIG_64BIT -/* popcnt %edi, %eax -- redundant REX prefix for alignment */ -#define POPCNT32 ".byte 0xf3,0x40,0x0f,0xb8,0xc7" +/* popcnt %edi, %eax */ +#define POPCNT32 ".byte 0xf3,0x0f,0xb8,0xc7" /* popcnt %rdi, %rax */ #define POPCNT64 ".byte 0xf3,0x48,0x0f,0xb8,0xc7" #define REG_IN "D" @@ -15,19 +15,15 @@ #define REG_OUT "a" #endif -/* - * __sw_hweightXX are called from within the alternatives below - * and callee-clobbered registers need to be taken care of. See - * ARCH_HWEIGHT_CFLAGS in for the respective - * compiler switches. - */ +#define __HAVE_ARCH_SW_HWEIGHT + static __always_inline unsigned int __arch_hweight32(unsigned int w) { - unsigned int res = 0; + unsigned int res; asm (ALTERNATIVE("call __sw_hweight32", POPCNT32, X86_FEATURE_POPCNT) - : "="REG_OUT (res) - : REG_IN (w)); + : "="REG_OUT (res) + : REG_IN (w)); return res; } @@ -51,11 +47,11 @@ static inline unsigned long __arch_hweight64(__u64 w) #else static __always_inline unsigned long __arch_hweight64(__u64 w) { - unsigned long res = 0; + unsigned long res; asm (ALTERNATIVE("call __sw_hweight64", POPCNT64, X86_FEATURE_POPCNT) - : "="REG_OUT (res) - : REG_IN (w)); + : "="REG_OUT (res) + : REG_IN (w)); return res; } diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c index 64341aa48..d40ee8a38 100644 --- a/arch/x86/kernel/i386_ksyms_32.c +++ b/arch/x86/kernel/i386_ksyms_32.c @@ -42,3 +42,5 @@ EXPORT_SYMBOL(empty_zero_page); EXPORT_SYMBOL(___preempt_schedule); EXPORT_SYMBOL(___preempt_schedule_notrace); #endif + +EXPORT_SYMBOL(__sw_hweight32); diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index a0695be19..c7efd394c 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -42,6 +42,9 @@ EXPORT_SYMBOL(clear_page); EXPORT_SYMBOL(csum_partial); +EXPORT_SYMBOL(__sw_hweight32); +EXPORT_SYMBOL(__sw_hweight64); + /* * Export string functions. We normally rely on gcc builtin for most of these, * but gcc sometimes decides not to inline them. 
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 8ac481870..c8ed431f9 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -26,7 +26,7 @@ lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o lib-$(CONFIG_RETPOLINE) += retpoline.o -obj-y += msr.o msr-reg.o msr-reg-export.o +obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o ifeq ($(CONFIG_X86_32),y) obj-y += atomic64_32.o diff --git a/arch/x86/lib/hweight.S b/arch/x86/lib/hweight.S new file mode 100644 index 000000000..02de3d74d --- /dev/null +++ b/arch/x86/lib/hweight.S @@ -0,0 +1,77 @@ +#include + +#include + +/* + * unsigned int __sw_hweight32(unsigned int w) + * %rdi: w + */ +ENTRY(__sw_hweight32) + +#ifdef CONFIG_X86_64 + movl %edi, %eax # w +#endif + __ASM_SIZE(push,) %__ASM_REG(dx) + movl %eax, %edx # w -> t + shrl %edx # t >>= 1 + andl $0x55555555, %edx # t &= 0x55555555 + subl %edx, %eax # w -= t + + movl %eax, %edx # w -> t + shrl $2, %eax # w_tmp >>= 2 + andl $0x33333333, %edx # t &= 0x33333333 + andl $0x33333333, %eax # w_tmp &= 0x33333333 + addl %edx, %eax # w = w_tmp + t + + movl %eax, %edx # w -> t + shrl $4, %edx # t >>= 4 + addl %edx, %eax # w_tmp += t + andl $0x0f0f0f0f, %eax # w_tmp &= 0x0f0f0f0f + imull $0x01010101, %eax, %eax # w_tmp *= 0x01010101 + shrl $24, %eax # w = w_tmp >> 24 + __ASM_SIZE(pop,) %__ASM_REG(dx) + ret +ENDPROC(__sw_hweight32) + +ENTRY(__sw_hweight64) +#ifdef CONFIG_X86_64 + pushq %rdx + + movq %rdi, %rdx # w -> t + movabsq $0x5555555555555555, %rax + shrq %rdx # t >>= 1 + andq %rdx, %rax # t &= 0x5555555555555555 + movabsq $0x3333333333333333, %rdx + subq %rax, %rdi # w -= t + + movq %rdi, %rax # w -> t + shrq $2, %rdi # w_tmp >>= 2 + andq %rdx, %rax # t &= 0x3333333333333333 + andq %rdi, %rdx # w_tmp &= 0x3333333333333333 + addq %rdx, %rax # w = w_tmp + t + + movq %rax, %rdx # w -> t + shrq $4, %rdx # t >>= 4 + addq %rdx, %rax # w_tmp += t + movabsq $0x0f0f0f0f0f0f0f0f, %rdx + andq %rdx, %rax # w_tmp &= 0x0f0f0f0f0f0f0f0f + movabsq $0x0101010101010101, %rdx + imulq %rdx, %rax # w_tmp *= 0x0101010101010101 + shrq $56, %rax # w = w_tmp >> 56 + + popq %rdx + ret +#else /* CONFIG_X86_32 */ + /* We're getting an u64 arg in (%eax,%edx): unsigned long hweight64(__u64 w) */ + pushl %ecx + + call __sw_hweight32 + movl %eax, %ecx # stash away result + movl %edx, %eax # second part of input + call __sw_hweight32 + addl %ecx, %eax # result + + popl %ecx + ret +#endif +ENDPROC(__sw_hweight64) diff --git a/lib/Makefile b/lib/Makefile index 69356970c..308534d0d 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -72,8 +72,6 @@ obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o devres.o obj-$(CONFIG_CHECK_SIGNATURE) += check_signature.o obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o -GCOV_PROFILE_hweight.o := n -CFLAGS_hweight.o = $(subst $(quote),,$(CONFIG_ARCH_HWEIGHT_CFLAGS)) obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o obj-$(CONFIG_BTREE) += btree.o diff --git a/lib/hweight.c b/lib/hweight.c index 9a5c1f221..43273a7d8 100644 --- a/lib/hweight.c +++ b/lib/hweight.c @@ -9,6 +9,7 @@ * The Hamming Weight of a number is the total number of bits set in it. 
*/ +#ifndef __HAVE_ARCH_SW_HWEIGHT unsigned int __sw_hweight32(unsigned int w) { #ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER @@ -25,6 +26,7 @@ unsigned int __sw_hweight32(unsigned int w) #endif } EXPORT_SYMBOL(__sw_hweight32); +#endif unsigned int __sw_hweight16(unsigned int w) { @@ -43,6 +45,7 @@ unsigned int __sw_hweight8(unsigned int w) } EXPORT_SYMBOL(__sw_hweight8); +#ifndef __HAVE_ARCH_SW_HWEIGHT unsigned long __sw_hweight64(__u64 w) { #if BITS_PER_LONG == 32 @@ -65,3 +68,4 @@ unsigned long __sw_hweight64(__u64 w) #endif } EXPORT_SYMBOL(__sw_hweight64); +#endif From cbca61d98fa5330c8ef603241bb339d233d9f10a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Mon, 8 Aug 2016 20:35:29 +0300 Subject: [PATCH 020/146] x86/hweight: Don't clobber %rdi MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 65ea11ec6a82b1d44aba62b59e9eb20247e57c6e upstream. The caller expects %rdi to remain intact, push+pop it make that happen. Fixes the following kind of explosions on my core2duo machine when trying to reboot or shut down: general protection fault: 0000 [#1] PREEMPT SMP Modules linked in: i915 i2c_algo_bit drm_kms_helper cfbfillrect syscopyarea cfbimgblt sysfillrect sysimgblt fb_sys_fops cfbcopyarea drm netconsole configfs binfmt_misc iTCO_wdt psmouse pcspkr snd_hda_codec_idt e100 coretemp hwmon snd_hda_codec_generic i2c_i801 mii i2c_smbus lpc_ich mfd_core snd_hda_intel uhci_hcd snd_hda_codec snd_hwdep snd_hda_core ehci_pci 8250 ehci_hcd snd_pcm 8250_base usbcore evdev serial_core usb_common parport_pc parport snd_timer snd soundcore CPU: 0 PID: 3070 Comm: reboot Not tainted 4.8.0-rc1-perf-dirty #69 Hardware name: /D946GZIS, BIOS TS94610J.86A.0087.2007.1107.1049 11/07/2007 task: ffff88012a0b4080 task.stack: ffff880123850000 RIP: 0010:[] [] x86_perf_event_update+0x52/0xc0 RSP: 0018:ffff880123853b60 EFLAGS: 00010087 RAX: 0000000000000001 RBX: ffff88012fc0a3c0 RCX: 000000000000001e RDX: 0000000000000000 RSI: 0000000040000000 RDI: ffff88012b014800 RBP: ffff880123853b88 R08: ffffffffffffffff R09: 0000000000000000 R10: ffffea0004a012c0 R11: ffffea0004acedc0 R12: ffffffff80000001 R13: ffff88012b0149c0 R14: ffff88012b014800 R15: 0000000000000018 FS: 00007f8b155cd700(0000) GS:ffff88012fc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f8b155f5000 CR3: 000000012a2d7000 CR4: 00000000000006f0 Stack: ffff88012fc0a3c0 ffff88012b014800 0000000000000004 0000000000000001 ffff88012fc1b750 ffff880123853bb0 ffffffff81003d59 ffff88012b014800 ffff88012fc0a3c0 ffff88012b014800 ffff880123853bd8 ffffffff81003e13 Call Trace: [] x86_pmu_stop+0x59/0xd0 [] x86_pmu_del+0x43/0x140 [] event_sched_out.isra.105+0xbd/0x260 [] __perf_remove_from_context+0x2d/0xb0 [] __perf_event_exit_context+0x4d/0x70 [] generic_exec_single+0xb6/0x140 [] ? __perf_remove_from_context+0xb0/0xb0 [] ? __perf_remove_from_context+0xb0/0xb0 [] smp_call_function_single+0xdf/0x140 [] perf_event_exit_cpu_context+0x87/0xc0 [] perf_reboot+0x13/0x40 [] notifier_call_chain+0x4a/0x70 [] __blocking_notifier_call_chain+0x47/0x60 [] blocking_notifier_call_chain+0x16/0x20 [] kernel_restart_prepare+0x1d/0x40 [] kernel_restart+0x12/0x60 [] SYSC_reboot+0xf6/0x1b0 [] ? mntput_no_expire+0x2c/0x1b0 [] ? mntput+0x24/0x40 [] ? __fput+0x16c/0x1e0 [] ? ____fput+0xe/0x10 [] ? task_work_run+0x83/0xa0 [] ? exit_to_usermode_loop+0x53/0xc0 [] ? 
trace_hardirqs_on_thunk+0x1a/0x1c [] SyS_reboot+0xe/0x10 [] entry_SYSCALL_64_fastpath+0x18/0xa3 Code: 7c 4c 8d af c0 01 00 00 49 89 fe eb 10 48 09 c2 4c 89 e0 49 0f b1 55 00 4c 39 e0 74 35 4d 8b a6 c0 01 00 00 41 8b 8e 60 01 00 00 <0f> 33 8b 35 6e 02 8c 00 48 c1 e2 20 85 f6 7e d2 48 89 d3 89 cf RIP [] x86_perf_event_update+0x52/0xc0 RSP ---[ end trace 7ec95181faf211be ]--- note: reboot[3070] exited with preempt_count 2 Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Andy Lutomirski Cc: Brian Gerst Cc: Denys Vlasenko Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Fixes: f5967101e9de ("x86/hweight: Get rid of the special calling convention") Signed-off-by: Ville Syrjälä Signed-off-by: Linus Torvalds Cc: Matthias Kaehlcke Signed-off-by: Greg Kroah-Hartman --- arch/x86/lib/hweight.S | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/lib/hweight.S b/arch/x86/lib/hweight.S index 02de3d74d..8a602a1e4 100644 --- a/arch/x86/lib/hweight.S +++ b/arch/x86/lib/hweight.S @@ -35,6 +35,7 @@ ENDPROC(__sw_hweight32) ENTRY(__sw_hweight64) #ifdef CONFIG_X86_64 + pushq %rdi pushq %rdx movq %rdi, %rdx # w -> t @@ -60,6 +61,7 @@ ENTRY(__sw_hweight64) shrq $56, %rax # w = w_tmp >> 56 popq %rdx + popq %rdi ret #else /* CONFIG_X86_32 */ /* We're getting an u64 arg in (%eax,%edx): unsigned long hweight64(__u64 w) */ From cf5b4c9e091a8cf16aa628215ed642ac7507a116 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Feb 2018 07:38:08 -0800 Subject: [PATCH 021/146] tty: make n_tty_read() always abort if hangup is in progress commit 28b0f8a6962a24ed21737578f3b1b07424635c9e upstream. A tty is hung up by __tty_hangup() setting file->f_op to hung_up_tty_fops, which is skipped on ttys whose write operation isn't tty_write(). This means that, for example, /dev/console whose write op is redirected_tty_write() is never actually marked hung up. Because n_tty_read() uses the hung up status to decide whether to abort the waiting readers, the lack of hung-up marking can lead to the following scenario. 1. A session contains two processes. The leader and its child. The child ignores SIGHUP. 2. The leader exits and starts disassociating from the controlling terminal (/dev/console). 3. __tty_hangup() skips setting f_op to hung_up_tty_fops. 4. SIGHUP is delivered and ignored. 5. tty_ldisc_hangup() is invoked. It wakes up the waits which should clear the read lockers of tty->ldisc_sem. 6. The reader wakes up but because tty_hung_up_p() is false, it doesn't abort and goes back to sleep while read-holding tty->ldisc_sem. 7. The leader progresses to tty_ldisc_lock() in tty_ldisc_hangup() and is now stuck in D sleep indefinitely waiting for tty->ldisc_sem. The following is Alan's explanation on why some ttys aren't hung up. http://lkml.kernel.org/r/20171101170908.6ad08580@alans-desktop 1. It broke the serial consoles because they would hang up and close down the hardware. With tty_port that *should* be fixable properly for any cases remaining. 2. The console layer was (and still is) completely broken and doens't refcount properly. So if you turn on console hangups it breaks (as indeed does freeing consoles and half a dozen other things). As neither can be fixed quickly, this patch works around the problem by introducing a new flag, TTY_HUPPING, which is used solely to tell n_tty_read() that hang-up is in progress for the console and the readers should be aborted regardless of the hung-up status of the device. The following is a sample hung task warning caused by this issue. 
INFO: task agetty:2662 blocked for more than 120 seconds. Not tainted 4.11.3-dbg-tty-lockup-02478-gfd6c7ee-dirty #28 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. 0 2662 1 0x00000086 Call Trace: __schedule+0x267/0x890 schedule+0x36/0x80 schedule_timeout+0x23c/0x2e0 ldsem_down_write+0xce/0x1f6 tty_ldisc_lock+0x16/0x30 tty_ldisc_hangup+0xb3/0x1b0 __tty_hangup+0x300/0x410 disassociate_ctty+0x6c/0x290 do_exit+0x7ef/0xb00 do_group_exit+0x3f/0xa0 get_signal+0x1b3/0x5d0 do_signal+0x28/0x660 exit_to_usermode_loop+0x46/0x86 do_syscall_64+0x9c/0xb0 entry_SYSCALL64_slow_path+0x25/0x25 The following is the repro. Run "$PROG /dev/console". The parent process hangs in D state. #include #include #include #include #include #include #include #include #include #include #include #include int main(int argc, char **argv) { struct sigaction sact = { .sa_handler = SIG_IGN }; struct timespec ts1s = { .tv_sec = 1 }; pid_t pid; int fd; if (argc < 2) { fprintf(stderr, "test-hung-tty /dev/$TTY\n"); return 1; } /* fork a child to ensure that it isn't already the session leader */ pid = fork(); if (pid < 0) { perror("fork"); return 1; } if (pid > 0) { /* top parent, wait for everyone */ while (waitpid(-1, NULL, 0) >= 0) ; if (errno != ECHILD) perror("waitpid"); return 0; } /* new session, start a new session and set the controlling tty */ if (setsid() < 0) { perror("setsid"); return 1; } fd = open(argv[1], O_RDWR); if (fd < 0) { perror("open"); return 1; } if (ioctl(fd, TIOCSCTTY, 1) < 0) { perror("ioctl"); return 1; } /* fork a child, sleep a bit and exit */ pid = fork(); if (pid < 0) { perror("fork"); return 1; } if (pid > 0) { nanosleep(&ts1s, NULL); printf("Session leader exiting\n"); exit(0); } /* * The child ignores SIGHUP and keeps reading from the controlling * tty. Because SIGHUP is ignored, the child doesn't get killed on * parent exit and the bug in n_tty makes the read(2) block the * parent's control terminal hangup attempt. The parent ends up in * D sleep until the child is explicitly killed. */ sigaction(SIGHUP, &sact, NULL); printf("Child reading tty\n"); while (1) { char buf[1024]; if (read(fd, buf, sizeof(buf)) < 0) { perror("read"); return 1; } } return 0; } Signed-off-by: Tejun Heo Cc: Alan Cox Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/tty/n_tty.c | 6 ++++++ drivers/tty/tty_io.c | 9 +++++++++ include/linux/tty.h | 1 + 3 files changed, 16 insertions(+) diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c index 41dda25da..190e5dc15 100644 --- a/drivers/tty/n_tty.c +++ b/drivers/tty/n_tty.c @@ -2238,6 +2238,12 @@ static ssize_t n_tty_read(struct tty_struct *tty, struct file *file, } if (tty_hung_up_p(file)) break; + /* + * Abort readers for ttys which never actually + * get hung up. See __tty_hangup(). + */ + if (test_bit(TTY_HUPPING, &tty->flags)) + break; if (!timeout) break; if (file->f_flags & O_NONBLOCK) { diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index a638c1738..89fd20382 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -702,6 +702,14 @@ static void __tty_hangup(struct tty_struct *tty, int exit_session) return; } + /* + * Some console devices aren't actually hung up for technical and + * historical reasons, which can lead to indefinite interruptible + * sleep in n_tty_read(). The following explicitly tells + * n_tty_read() to abort readers. 
+ */ + set_bit(TTY_HUPPING, &tty->flags); + /* inuse_filps is protected by the single tty lock, this really needs to change if we want to flush the workqueue with the lock held */ @@ -757,6 +765,7 @@ static void __tty_hangup(struct tty_struct *tty, int exit_session) * can't yet guarantee all that. */ set_bit(TTY_HUPPED, &tty->flags); + clear_bit(TTY_HUPPING, &tty->flags); tty_unlock(tty); if (f) diff --git a/include/linux/tty.h b/include/linux/tty.h index a1042afff..d67ceb3f5 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -342,6 +342,7 @@ struct tty_file_private { #define TTY_PTY_LOCK 16 /* pty private */ #define TTY_NO_WRITE_SPLIT 17 /* Preserve write boundaries to driver */ #define TTY_HUPPED 18 /* Post driver->hangup() */ +#define TTY_HUPPING 19 /* Hangup in progress */ #define TTY_LDISC_HALTED 22 /* Line discipline is halted */ #define TTY_WRITE_FLUSH(tty) tty_write_flush((tty)) From 8935dd04707ee6cbe4d1ad37621db92a252230df Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Wed, 17 Jan 2018 19:12:42 +0100 Subject: [PATCH 022/146] ubifs: Check ubifs_wbuf_sync() return code commit aac17948a7ce01fb60b9ee6cf902967a47b3ce26 upstream. If ubifs_wbuf_sync() fails we must not write a master node with the dirty marker cleared. Otherwise it is possible that in case of an IO error while syncing we mark the filesystem as clean and UBIFS refuses to recover upon next mount. Cc: Fixes: 1e51764a3c2a ("UBIFS: add new flash file system") Signed-off-by: Richard Weinberger Signed-off-by: Greg Kroah-Hartman --- fs/ubifs/super.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 1fd90c079..0bb6de356 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1728,8 +1728,11 @@ static void ubifs_remount_ro(struct ubifs_info *c) dbg_save_space_info(c); - for (i = 0; i < c->jhead_cnt; i++) - ubifs_wbuf_sync(&c->jheads[i].wbuf); + for (i = 0; i < c->jhead_cnt; i++) { + err = ubifs_wbuf_sync(&c->jheads[i].wbuf); + if (err) + ubifs_ro_mode(c, err); + } c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY); c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS); @@ -1795,8 +1798,11 @@ static void ubifs_put_super(struct super_block *sb) int err; /* Synchronize write-buffers */ - for (i = 0; i < c->jhead_cnt; i++) - ubifs_wbuf_sync(&c->jheads[i].wbuf); + for (i = 0; i < c->jhead_cnt; i++) { + err = ubifs_wbuf_sync(&c->jheads[i].wbuf); + if (err) + ubifs_ro_mode(c, err); + } /* * We are being cleanly unmounted which means the From 45544420efce667850dca4a89e5e300e9a127e4f Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Wed, 17 Jan 2018 23:15:57 +0100 Subject: [PATCH 023/146] ubi: fastmap: Don't flush fastmap work on detach commit 29b7a6fa1ec07e8480b0d9caf635a4498a438bf4 upstream. At this point UBI volumes have already been free()'ed and fastmap can no longer access these data structures. 
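For readers unfamiliar with the ordering problem behind this one-line removal: deferred work that dereferences driver state may only be flushed or cancelled while that state is still alive. The sketch below is illustrative only -- struct foo and its helpers are made-up names, not the actual UBI structures -- but it shows the teardown ordering the fix restores.

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct foo {
	struct work_struct work;
	int *state;			/* data the work handler touches */
};

static void foo_work_fn(struct work_struct *w)
{
	struct foo *f = container_of(w, struct foo, work);

	*f->state = 1;			/* only safe while f->state is allocated */
}

static void foo_init(struct foo *f)
{
	INIT_WORK(&f->work, foo_work_fn);
}

static void foo_teardown(struct foo *f)
{
	/* Quiesce the work first, then free what it uses.  Flushing (or
	 * letting the work run) after the state is gone is a use-after-free,
	 * which is what the flush in ubi_fastmap_close() risked here. */
	cancel_work_sync(&f->work);
	kfree(f->state);
	kfree(f);
}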
Reported-by: Martin Townsend Fixes: 74cdaf24004a ("UBI: Fastmap: Fix memory leaks while closing the WL sub-system") Cc: stable@vger.kernel.org Signed-off-by: Richard Weinberger Signed-off-by: Greg Kroah-Hartman --- drivers/mtd/ubi/fastmap-wl.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/mtd/ubi/fastmap-wl.c b/drivers/mtd/ubi/fastmap-wl.c index 30d3999dd..ed62f1efe 100644 --- a/drivers/mtd/ubi/fastmap-wl.c +++ b/drivers/mtd/ubi/fastmap-wl.c @@ -360,7 +360,6 @@ static void ubi_fastmap_close(struct ubi_device *ubi) { int i; - flush_work(&ubi->fm_work); return_unused_pool_pebs(ubi, &ubi->fm_pool); return_unused_pool_pebs(ubi, &ubi->fm_wl_pool); From 41854494d203cde51b574e16aec2995a79ab3625 Mon Sep 17 00:00:00 2001 From: Romain Izard Date: Mon, 29 Jan 2018 11:18:20 +0100 Subject: [PATCH 024/146] ubi: Fix error for write access commit 78a8dfbabbece22bee58ac4cb26cab10e7a19c5d upstream. When opening a device with write access, ubiblock_open returns an error code. Currently, this error code is -EPERM, but this is not the right value. The open function for other block devices returns -EROFS when opening read-only devices with FMODE_WRITE set. When used with dm-verity, the veritysetup userspace tool is expecting EROFS, and refuses to use the ubiblock device. Use -EROFS for ubiblock as well. As a result, veritysetup accepts the ubiblock device as valid. Cc: stable@vger.kernel.org Fixes: 9d54c8a33eec (UBI: R/O block driver on top of UBI volumes) Signed-off-by: Romain Izard Signed-off-by: Richard Weinberger Signed-off-by: Greg Kroah-Hartman --- drivers/mtd/ubi/block.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c index b2fb0528c..07ad86759 100644 --- a/drivers/mtd/ubi/block.c +++ b/drivers/mtd/ubi/block.c @@ -244,7 +244,7 @@ static int ubiblock_open(struct block_device *bdev, fmode_t mode) * in any case. */ if (mode & FMODE_WRITE) { - ret = -EPERM; + ret = -EROFS; goto out_unlock; } From e814ac34e1f11f0b5130ccbffd2e46349f40f845 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Sat, 3 Mar 2018 11:45:54 +0100 Subject: [PATCH 025/146] ubi: Reject MLC NAND commit b5094b7f135be34630e3ea8a98fa215715d0f29d upstream. While UBI and UBIFS seem to work at first sight with MLC NAND, you will most likely lose all your data upon a power-cut or due to read/write disturb. In order to protect users from bad surprises, refuse to attach to MLC NAND. Cc: stable@vger.kernel.org Signed-off-by: Richard Weinberger Acked-by: Boris Brezillon Acked-by: Artem Bityutskiy Signed-off-by: Greg Kroah-Hartman --- drivers/mtd/ubi/build.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c index 27de04632..a2e6c7848 100644 --- a/drivers/mtd/ubi/build.c +++ b/drivers/mtd/ubi/build.c @@ -889,6 +889,17 @@ int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num, return -EINVAL; } + /* + * Both UBI and UBIFS have been designed for SLC NAND and NOR flashes. + * MLC NAND is different and needs special care, otherwise UBI or UBIFS + * will die soon and you will lose all your data. 
+ */ + if (mtd->type == MTD_MLCNANDFLASH) { + pr_err("ubi: refuse attaching mtd%d - MLC NAND is not supported\n", + mtd->index); + return -EINVAL; + } + if (ubi_num == UBI_DEV_NUM_AUTO) { /* Search for an empty slot in the @ubi_devices array */ for (ubi_num = 0; ubi_num < UBI_MAX_DEVICES; ubi_num++) From d06ac74c30d8cc007dc6d9d941495ace4a6bd8e0 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 10 Apr 2018 16:34:41 -0700 Subject: [PATCH 026/146] fs/reiserfs/journal.c: add missing resierfs_warning() arg commit 9ad553abe66f8be3f4755e9fa0a6ba137ce76341 upstream. One use of the reiserfs_warning() macro in journal_init_dev() is missing a parameter, causing the following warning: REISERFS warning (device loop0): journal_init_dev: Cannot open '%s': %i journal_init_dev: This also causes a WARN_ONCE() warning in the vsprintf code, and then a panic if panic_on_warn is set. Please remove unsupported %/ in format string WARNING: CPU: 1 PID: 4480 at lib/vsprintf.c:2138 format_decode+0x77f/0x830 lib/vsprintf.c:2138 Kernel panic - not syncing: panic_on_warn set ... Just add another string argument to the macro invocation. Addresses https://syzkaller.appspot.com/bug?id=0627d4551fdc39bf1ef5d82cd9eef587047f7718 Link: http://lkml.kernel.org/r/d678ebe1-6f54-8090-df4c-b9affad62293@infradead.org Signed-off-by: Randy Dunlap Reported-by: Tested-by: Randy Dunlap Acked-by: Jeff Mahoney Cc: Alexander Viro Cc: Jan Kara Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/reiserfs/journal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index a72097b62..00985f9db 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2643,7 +2643,7 @@ static int journal_init_dev(struct super_block *super, if (IS_ERR(journal->j_dev_bd)) { result = PTR_ERR(journal->j_dev_bd); journal->j_dev_bd = NULL; - reiserfs_warning(super, + reiserfs_warning(super, "sh-457", "journal_init_dev: Cannot open '%s': %i", jdev_name, result); return result; From ff9194f304d6700e5ec302491e7a0c03f134c1e8 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 13 Apr 2018 15:35:13 -0700 Subject: [PATCH 027/146] resource: fix integer overflow at reallocation commit 60bb83b81169820c691fbfa33a6a4aef32aa4b0b upstream. We've got a bug report indicating a kernel panic at booting on an x86-32 system, and it turned out to be the invalid PCI resource assigned after reallocation. __find_resource() first aligns the resource start address and resets the end address with start+size-1 accordingly, then checks whether it's contained. Here the end address may overflow the integer, although resource_contains() still returns true because the function validates only start and end address. So this ends up with returning an invalid resource (start > end). There was already an attempt to cover such a problem in the commit 47ea91b4052d ("Resource: fix wrong resource window calculation"), but this case is an overseen one. This patch adds the validity check of the newly calculated resource for avoiding the integer overflow problem. 
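A small, self-contained illustration of the wrap-around that the new alloc.start <= alloc.end test rejects (plain userspace C; a 32-bit resource_size_t is assumed and the values are made up for demonstration):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t start = 0xfffff000u;		/* aligned candidate start near the top */
	uint32_t size  = 0x00002000u;		/* 8 KiB window requested */
	uint32_t end   = start + size - 1u;	/* wraps around to 0x00000fff */

	printf("start=%#x end=%#x sane=%d\n", start, end, start <= end);
	/* prints: start=0xfffff000 end=0xfff sane=0 */
	return 0;
}

Because resource_contains() only compares the start and end of the two ranges, a wrapped candidate like this can still appear to fit inside the available window, which is why the explicit start-versus-end ordering check is needed.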
Bugzilla: http://bugzilla.opensuse.org/show_bug.cgi?id=1086739 Link: http://lkml.kernel.org/r/s5hpo37d5l8.wl-tiwai@suse.de Fixes: 23c570a67448 ("resource: ability to resize an allocated resource") Signed-off-by: Takashi Iwai Reported-by: Michael Henders Tested-by: Michael Henders Reviewed-by: Andrew Morton Cc: Ram Pai Cc: Bjorn Helgaas Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- kernel/resource.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/resource.c b/kernel/resource.c index c09d484f7..73348f574 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -611,7 +611,8 @@ static int __find_resource(struct resource *root, struct resource *old, alloc.start = constraint->alignf(constraint->alignf_data, &avail, size, constraint->align); alloc.end = alloc.start + size - 1; - if (resource_contains(&avail, &alloc)) { + if (alloc.start <= alloc.end && + resource_contains(&avail, &alloc)) { new->start = alloc.start; new->end = alloc.end; return 0; From 80a6e1fdff934a006fd9f9168e132b8ff2a479ff Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 13 Apr 2018 15:35:30 -0700 Subject: [PATCH 028/146] ipc/shm: fix use-after-free of shm file via remap_file_pages() commit 3f05317d9889ab75c7190dcd39491d2a97921984 upstream. syzbot reported a use-after-free of shm_file_data(file)->file->f_op in shm_get_unmapped_area(), called via sys_remap_file_pages(). Unfortunately it couldn't generate a reproducer, but I found a bug which I think caused it. When remap_file_pages() is passed a full System V shared memory segment, the memory is first unmapped, then a new map is created using the ->vm_file. Between these steps, the shm ID can be removed and reused for a new shm segment. But, shm_mmap() only checks whether the ID is currently valid before calling the underlying file's ->mmap(); it doesn't check whether it was reused. Thus it can use the wrong underlying file, one that was already freed. Fix this by making the "outer" shm file (the one that gets put in ->vm_file) hold a reference to the real shm file, and by making __shm_open() require that the file associated with the shm ID matches the one associated with the "outer" file. Taking the reference to the real shm file is needed to fully solve the problem, since otherwise sfd->file could point to a freed file, which then could be reallocated for the reused shm ID, causing the wrong shm segment to be mapped (and without the required permission checks). Commit 1ac0b6dec656 ("ipc/shm: handle removed segments gracefully in shm_mmap()") almost fixed this bug, but it didn't go far enough because it didn't consider the case where the shm ID is reused. The following program usually reproduces this bug: #include #include #include #include int main() { int is_parent = (fork() != 0); srand(getpid()); for (;;) { int id = shmget(0xF00F, 4096, IPC_CREAT|0700); if (is_parent) { void *addr = shmat(id, NULL, 0); usleep(rand() % 50); while (!syscall(__NR_remap_file_pages, addr, 4096, 0, 0, 0)); } else { usleep(rand() % 50); shmctl(id, IPC_RMID, NULL); } } } It causes the following NULL pointer dereference due to a 'struct file' being used while it's being freed. (I couldn't actually get a KASAN use-after-free splat like in the syzbot report. But I think it's possible with this bug; it would just take a more extraordinary race...) 
BUG: unable to handle kernel NULL pointer dereference at 0000000000000058 PGD 0 P4D 0 Oops: 0000 [#1] SMP NOPTI CPU: 9 PID: 258 Comm: syz_ipc Not tainted 4.16.0-05140-gf8cf2f16a7c95 #189 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-20171110_100015-anatol 04/01/2014 RIP: 0010:d_inode include/linux/dcache.h:519 [inline] RIP: 0010:touch_atime+0x25/0xd0 fs/inode.c:1724 [...] Call Trace: file_accessed include/linux/fs.h:2063 [inline] shmem_mmap+0x25/0x40 mm/shmem.c:2149 call_mmap include/linux/fs.h:1789 [inline] shm_mmap+0x34/0x80 ipc/shm.c:465 call_mmap include/linux/fs.h:1789 [inline] mmap_region+0x309/0x5b0 mm/mmap.c:1712 do_mmap+0x294/0x4a0 mm/mmap.c:1483 do_mmap_pgoff include/linux/mm.h:2235 [inline] SYSC_remap_file_pages mm/mmap.c:2853 [inline] SyS_remap_file_pages+0x232/0x310 mm/mmap.c:2769 do_syscall_64+0x64/0x1a0 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x42/0xb7 [ebiggers@google.com: add comment] Link: http://lkml.kernel.org/r/20180410192850.235835-1-ebiggers3@gmail.com Link: http://lkml.kernel.org/r/20180409043039.28915-1-ebiggers3@gmail.com Reported-by: syzbot+d11f321e7f1923157eac80aa990b446596f46439@syzkaller.appspotmail.com Fixes: c8d78c1823f4 ("mm: replace remap_file_pages() syscall with emulation") Signed-off-by: Eric Biggers Acked-by: Kirill A. Shutemov Acked-by: Davidlohr Bueso Cc: Manfred Spraul Cc: "Eric W . Biederman" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- ipc/shm.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/ipc/shm.c b/ipc/shm.c index 4982a4e7f..a492dd81c 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -198,6 +198,12 @@ static int __shm_open(struct vm_area_struct *vma) if (IS_ERR(shp)) return PTR_ERR(shp); + if (shp->shm_file != sfd->file) { + /* ID was reused */ + shm_unlock(shp); + return -EINVAL; + } + shp->shm_atim = get_seconds(); shp->shm_lprid = task_tgid_vnr(current); shp->shm_nattch++; @@ -414,8 +420,9 @@ static int shm_mmap(struct file *file, struct vm_area_struct *vma) int ret; /* - * In case of remap_file_pages() emulation, the file can represent - * removed IPC ID: propogate shm_lock() error to caller. + * In case of remap_file_pages() emulation, the file can represent an + * IPC ID that was removed, and possibly even reused by another shm + * segment already. Propagate this case as an error to caller. */ ret =__shm_open(vma); if (ret) @@ -439,6 +446,7 @@ static int shm_release(struct inode *ino, struct file *file) struct shm_file_data *sfd = shm_file_data(file); put_ipc_ns(sfd->ns); + fput(sfd->file); shm_file_data(file) = NULL; kfree(sfd); return 0; @@ -1198,7 +1206,16 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, file->f_mapping = shp->shm_file->f_mapping; sfd->id = shp->shm_perm.id; sfd->ns = get_ipc_ns(ns); - sfd->file = shp->shm_file; + /* + * We need to take a reference to the real shm file to prevent the + * pointer from becoming stale in cases where the lifetime of the outer + * file extends beyond that of the shm segment. It's not usually + * possible, but it can happen during remap_file_pages() emulation as + * that unmaps the memory, then does ->mmap() via file reference only. + * We'll deny the ->mmap() if the shm segment was since removed, but to + * detect shm ID reuse we need to compare the file pointers. 
+ */ + sfd->file = get_file(shp->shm_file); sfd->vm_ops = NULL; err = security_mmap_file(file, prot, flags); From 2ed9533d07a1d54fde7a35e83628ddd29c3cec3b Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 13 Apr 2018 15:35:38 -0700 Subject: [PATCH 029/146] mm, slab: reschedule cache_reap() on the same CPU commit a9f2a846f0503e7d729f552e3ccfe2279010fe94 upstream. cache_reap() is initially scheduled in start_cpu_timer() via schedule_delayed_work_on(). But then the next iterations are scheduled via schedule_delayed_work(), i.e. using WORK_CPU_UNBOUND. Thus since commit ef557180447f ("workqueue: schedule WORK_CPU_UNBOUND work on wq_unbound_cpumask CPUs") there is no guarantee the future iterations will run on the originally intended cpu, although it's still preferred. I was able to demonstrate this with /sys/module/workqueue/parameters/debug_force_rr_cpu. IIUC, it may also happen due to migrating timers in nohz context. As a result, some cpu's would be calling cache_reap() more frequently and others never. This patch uses schedule_delayed_work_on() with the current cpu when scheduling the next iteration. Link: http://lkml.kernel.org/r/20180411070007.32225-1-vbabka@suse.cz Fixes: ef557180447f ("workqueue: schedule WORK_CPU_UNBOUND work on wq_unbound_cpumask CPUs") Signed-off-by: Vlastimil Babka Acked-by: Pekka Enberg Acked-by: Christoph Lameter Cc: Joonsoo Kim Cc: David Rientjes Cc: Tejun Heo Cc: Lai Jiangshan Cc: John Stultz Cc: Thomas Gleixner Cc: Stephen Boyd Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/slab.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/slab.c b/mm/slab.c index 7a5b5dd3f..6b8db2ae0 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3922,7 +3922,8 @@ static void cache_reap(struct work_struct *w) next_reap_node(); out: /* Set up the next iteration */ - schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_AC)); + schedule_delayed_work_on(smp_processor_id(), work, + round_jiffies_relative(REAPTIMEOUT_AC)); } #ifdef CONFIG_SLABINFO From ea37c2715a34e2afb2b4c1077094e6b6205b3caa Mon Sep 17 00:00:00 2001 From: Heinrich Schuchardt Date: Thu, 29 Mar 2018 10:48:28 -0500 Subject: [PATCH 030/146] usb: musb: gadget: misplaced out of bounds check commit af6f8529098aeb0e56a68671b450cf74e7a64fcd upstream. musb->endpoints[] has array size MUSB_C_NUM_EPS. We must check array bounds before accessing the array and not afterwards. 
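Reduced to its essence, the fix is the classic check-before-use pattern: validate the index before it is used to address the fixed-size array, not afterwards. The sketch uses hypothetical names (NUM_EPS, struct endpoint), not the musb data structures.

#define NUM_EPS 16			/* stand-in for MUSB_C_NUM_EPS */

struct endpoint {
	int in_use;
};

static struct endpoint endpoints[NUM_EPS];

static struct endpoint *lookup_ep(unsigned int epnum)
{
	if (epnum >= NUM_EPS)		/* reject out-of-range values first */
		return NULL;
	return &endpoints[epnum];	/* only now is the index known to be safe */
}

In the original code the endpoint entry, including its register pointer, was fetched from musb->endpoints[epnum] before the epnum >= MUSB_C_NUM_EPS test ran, so a malformed request index already read past the end of the array.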
Signed-off-by: Heinrich Schuchardt Signed-off-by: Bin Liu Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/musb/musb_gadget_ep0.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/usb/musb/musb_gadget_ep0.c b/drivers/usb/musb/musb_gadget_ep0.c index 10d30afe4..a0d141736 100644 --- a/drivers/usb/musb/musb_gadget_ep0.c +++ b/drivers/usb/musb/musb_gadget_ep0.c @@ -114,15 +114,19 @@ static int service_tx_status_request( } is_in = epnum & USB_DIR_IN; - if (is_in) { - epnum &= 0x0f; + epnum &= 0x0f; + if (epnum >= MUSB_C_NUM_EPS) { + handled = -EINVAL; + break; + } + + if (is_in) ep = &musb->endpoints[epnum].ep_in; - } else { + else ep = &musb->endpoints[epnum].ep_out; - } regs = musb->endpoints[epnum].regs; - if (epnum >= MUSB_C_NUM_EPS || !ep->desc) { + if (!ep->desc) { handled = -EINVAL; break; } From ecd76e9c71e108a24e078cdf4542bd5a8c0e6228 Mon Sep 17 00:00:00 2001 From: Nicolas Ferre Date: Tue, 13 Mar 2018 16:20:05 +0100 Subject: [PATCH 031/146] ARM: dts: at91: at91sam9g25: fix mux-mask pinctrl property commit e8fd0adf105e132fd84545997bbef3d5edc2c9c1 upstream. There are only 19 PIOB pins having primary names PB0-PB18. Not all of them have a 'C' function. So the pinctrl property mask ends up being the same as the other SoC of the at91sam9x5 series. Reported-by: Marek Sieranski Signed-off-by: Nicolas Ferre Cc: # v3.8+ Signed-off-by: Alexandre Belloni Signed-off-by: Greg Kroah-Hartman --- arch/arm/boot/dts/at91sam9g25.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/at91sam9g25.dtsi b/arch/arm/boot/dts/at91sam9g25.dtsi index a7da0dd0c..0898213f3 100644 --- a/arch/arm/boot/dts/at91sam9g25.dtsi +++ b/arch/arm/boot/dts/at91sam9g25.dtsi @@ -21,7 +21,7 @@ atmel,mux-mask = < /* A B C */ 0xffffffff 0xffe0399f 0xc000001c /* pioA */ - 0x0007ffff 0x8000fe3f 0x00000000 /* pioB */ + 0x0007ffff 0x00047e3f 0x00000000 /* pioB */ 0x80000000 0x07c0ffff 0xb83fffff /* pioC */ 0x003fffff 0x003f8000 0x00000000 /* pioD */ >; From 1276dc3da047255650a1c40b88ef7b68fe1fef86 Mon Sep 17 00:00:00 2001 From: Santiago Esteban Date: Thu, 18 Jan 2018 15:38:47 +0100 Subject: [PATCH 032/146] ARM: dts: at91: sama5d4: fix pinctrl compatible string commit 9a06757dcc8509c162ac00488c8c82fc98e04227 upstream. The compatible string is incorrect. Add atmel,sama5d3-pinctrl since it's the appropriate compatible string. Remove the atmel,at91rm9200-pinctrl compatible string, this fallback is useless, there are too many changes. 
Signed-off-by: Santiago Esteban Signed-off-by: Ludovic Desroches Cc: stable@vger.kernel.org #v3.18 Signed-off-by: Alexandre Belloni Signed-off-by: Greg Kroah-Hartman --- arch/arm/boot/dts/sama5d4.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/sama5d4.dtsi b/arch/arm/boot/dts/sama5d4.dtsi index 3daf8d5d7..fb0d1b252 100644 --- a/arch/arm/boot/dts/sama5d4.dtsi +++ b/arch/arm/boot/dts/sama5d4.dtsi @@ -1354,7 +1354,7 @@ pinctrl@fc06a000 { #address-cells = <1>; #size-cells = <1>; - compatible = "atmel,at91sam9x5-pinctrl", "atmel,at91rm9200-pinctrl", "simple-bus"; + compatible = "atmel,sama5d3-pinctrl", "atmel,at91sam9x5-pinctrl", "simple-bus"; ranges = <0xfc068000 0xfc068000 0x100 0xfc06a000 0xfc06a000 0x4000>; /* WARNING: revisit as pin spec has changed */ From 7335e813c69d97232948cd111cd3c71a0db4b42b Mon Sep 17 00:00:00 2001 From: Jason Andryuk Date: Wed, 28 Feb 2018 07:23:23 -0500 Subject: [PATCH 033/146] xen-netfront: Fix hang on device removal commit c2d2e6738a209f0f9dffa2dc8e7292fc45360d61 upstream. A toolstack may delete the vif frontend and backend xenstore entries while xen-netfront is in the removal code path. In that case, the checks for xenbus_read_driver_state would return XenbusStateUnknown, and xennet_remove would hang indefinitely. This hang prevents system shutdown. xennet_remove must be able to handle XenbusStateUnknown, and netback_changed must also wake up the wake_queue for that state as well. Fixes: 5b5971df3bc2 ("xen-netfront: remove warning when unloading module") Signed-off-by: Jason Andryuk Cc: Eduardo Otubo Reviewed-by: Boris Ostrovsky Signed-off-by: Juergen Gross Signed-off-by: Greg Kroah-Hartman --- drivers/net/xen-netfront.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index 0b8d26559..fee4c01fb 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -2024,7 +2024,10 @@ static void netback_changed(struct xenbus_device *dev, case XenbusStateInitialised: case XenbusStateReconfiguring: case XenbusStateReconfigured: + break; + case XenbusStateUnknown: + wake_up_all(&module_unload_q); break; case XenbusStateInitWait: @@ -2155,7 +2158,9 @@ static int xennet_remove(struct xenbus_device *dev) xenbus_switch_state(dev, XenbusStateClosing); wait_event(module_unload_q, xenbus_read_driver_state(dev->otherend) == - XenbusStateClosing); + XenbusStateClosing || + xenbus_read_driver_state(dev->otherend) == + XenbusStateUnknown); xenbus_switch_state(dev, XenbusStateClosed); wait_event(module_unload_q, From 398b633e81bd90f26e6c925fa823420ebfb609cd Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 8 Feb 2018 10:23:44 +0300 Subject: [PATCH 034/146] regmap: Fix reversed bounds check in regmap_raw_write() commit f00e71091ab92eba52122332586c6ecaa9cd1a56 upstream. We're supposed to be checking that "val_len" is not too large but instead we check if it is smaller than the max. The only function affected would be regmap_i2c_smbus_i2c_write() in drivers/base/regmap/regmap-i2c.c. Strangely that function has its own limit check which returns an error if (count >= I2C_SMBUS_BLOCK_MAX) so it doesn't look like it has ever been able to do anything except return an error. 
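Spelled out as a standalone helper (illustrative only, not the regmap code itself), the intended rule is "a limit of zero means unlimited, otherwise the payload must not exceed the limit"; the bug had the comparison pointing the other way, so every payload smaller than the limit was refused and oversized ones slipped past this check.

#include <linux/errno.h>

static int check_raw_write_len(size_t max_raw_write, size_t val_len)
{
	if (max_raw_write && max_raw_write < val_len)
		return -E2BIG;		/* payload larger than the bus can take */
	return 0;
}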
Fixes: c335931ed9d2 ("regmap: Add raw_write/read checks for max_raw_write/read sizes") Signed-off-by: Dan Carpenter Signed-off-by: Mark Brown Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/base/regmap/regmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c index 4ac63c0e5..fd377b956 100644 --- a/drivers/base/regmap/regmap.c +++ b/drivers/base/regmap/regmap.c @@ -1582,7 +1582,7 @@ int regmap_raw_write(struct regmap *map, unsigned int reg, return -EINVAL; if (val_len % map->format.val_bytes) return -EINVAL; - if (map->max_raw_write && map->max_raw_write > val_len) + if (map->max_raw_write && map->max_raw_write < val_len) return -E2BIG; map->lock(map->lock_arg); From 872a2077b34ad2d2d613dcd43b338ed3b8a0a94a Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 19 Mar 2018 18:01:45 +0100 Subject: [PATCH 035/146] ACPI / video: Add quirk to force acpi-video backlight on Samsung 670Z5E commit bbf038618a24d72e2efc19146ef421bb1e1eda1a upstream. Just like many other Samsung models, the 670Z5E needs to use the acpi-video backlight interface rather then the native one for backlight control to work, add a quirk for this. Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=1557060 Cc: All applicable Signed-off-by: Hans de Goede Signed-off-by: Rafael J. Wysocki Signed-off-by: Greg Kroah-Hartman --- drivers/acpi/video_detect.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/acpi/video_detect.c b/drivers/acpi/video_detect.c index b48ecbfc4..8c5503c0b 100644 --- a/drivers/acpi/video_detect.c +++ b/drivers/acpi/video_detect.c @@ -205,6 +205,15 @@ static const struct dmi_system_id video_detect_dmi_table[] = { "3570R/370R/470R/450R/510R/4450RV"), }, }, + { + /* https://bugzilla.redhat.com/show_bug.cgi?id=1557060 */ + .callback = video_detect_force_video, + .ident = "SAMSUNG 670Z5E", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "SAMSUNG ELECTRONICS CO., LTD."), + DMI_MATCH(DMI_PRODUCT_NAME, "670Z5E"), + }, + }, { /* https://bugzilla.redhat.com/show_bug.cgi?id=1094948 */ .callback = video_detect_force_video, From c641600bbdc8c93be72b3fe5008b63177c244be1 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 12 Feb 2018 13:55:23 +0300 Subject: [PATCH 036/146] ACPI / hotplug / PCI: Check presence of slot itself in get_slot_status() commit 13d3047c81505cc0fb9bdae7810676e70523c8bf upstream. Mike Lothian reported that plugging in a USB-C device does not work properly in his Dell Alienware system. This system has an Intel Alpine Ridge Thunderbolt controller providing USB-C functionality. In these systems the USB controller (xHCI) is hotplugged whenever a device is connected to the port using ACPI-based hotplug. The ACPI description of the root port in question is as follows: Device (RP01) { Name (_ADR, 0x001C0000) Device (PXSX) { Name (_ADR, 0x02) Method (_RMV, 0, NotSerialized) { // ... } } Here _ADR 0x02 means device 0, function 2 on the bus under root port (RP01) but that seems to be incorrect because device 0 is the upstream port of the Alpine Ridge PCIe switch and it has no functions other than 0 (the bridge itself). When we get ACPI Notify() to the root port resulting from connecting a USB-C device, Linux tries to read PCI_VENDOR_ID from device 0, function 2 which of course always returns 0xffffffff because there is no such function and we never find the device. In Windows this works fine. 
Now, since we get ACPI Notify() to the root port and not to the PXSX device we should actually start our scan from there as well and not from the non-existent PXSX device. Fix this by checking presence of the slot itself (function 0) if we fail to do that otherwise. While there use pci_bus_read_dev_vendor_id() in get_slot_status(), which is the recommended way to read Device and Vendor IDs of devices on PCI buses. Link: https://bugzilla.kernel.org/show_bug.cgi?id=198557 Reported-by: Mike Lothian Signed-off-by: Mika Westerberg Signed-off-by: Bjorn Helgaas Reviewed-by: Rafael J. Wysocki Cc: Greg Kroah-Hartman Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/pci/hotplug/acpiphp_glue.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c index 0b3e0bfa7..572ca192c 100644 --- a/drivers/pci/hotplug/acpiphp_glue.c +++ b/drivers/pci/hotplug/acpiphp_glue.c @@ -587,6 +587,7 @@ static unsigned int get_slot_status(struct acpiphp_slot *slot) { unsigned long long sta = 0; struct acpiphp_func *func; + u32 dvid; list_for_each_entry(func, &slot->funcs, sibling) { if (func->flags & FUNC_HAS_STA) { @@ -597,19 +598,27 @@ static unsigned int get_slot_status(struct acpiphp_slot *slot) if (ACPI_SUCCESS(status) && sta) break; } else { - u32 dvid; - - pci_bus_read_config_dword(slot->bus, - PCI_DEVFN(slot->device, - func->function), - PCI_VENDOR_ID, &dvid); - if (dvid != 0xffffffff) { + if (pci_bus_read_dev_vendor_id(slot->bus, + PCI_DEVFN(slot->device, func->function), + &dvid, 0)) { sta = ACPI_STA_ALL; break; } } } + if (!sta) { + /* + * Check for the slot itself since it may be that the + * ACPI slot is a device below PCIe upstream port so in + * that case it may not even be reachable yet. + */ + if (pci_bus_read_dev_vendor_id(slot->bus, + PCI_DEVFN(slot->device, 0), &dvid, 0)) { + sta = ACPI_STA_ALL; + } + } + return (unsigned int)sta; } From 18fec1a932d292a7089e88cbbc1798fa1a2b5c19 Mon Sep 17 00:00:00 2001 From: Zhengjun Xing Date: Wed, 21 Mar 2018 13:29:42 +0800 Subject: [PATCH 037/146] USB:fix USB3 devices behind USB3 hubs not resuming at hibernate thaw MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 64627388b50158fd24d6ad88132525b95a5ef573 upstream. USB3 hubs don't support global suspend. USB3 specification 10.10, Enhanced SuperSpeed hubs only support selective suspend and resume, they do not support global suspend/resume where the hub downstream facing ports states are not affected. When system enters hibernation it first enters freeze process where only the root hub enters suspend, usb_port_suspend() is not called for other devices, and suspend status flags are not set for them. Other devices are expected to suspend globally. Some external USB3 hubs will suspend the downstream facing port at global suspend. These devices won't be resumed at thaw as the suspend status flag is not set. A USB3 removable hard disk connected through a USB3 hub that won't resume at thaw will fail to synchronize SCSI cache, return “cmd cmplt err -71” error, and needs a 60 seconds timeout which causing system hang for 60s before the USB host reset the port for the USB3 removable hard disk to recover. Fix this by always calling usb_port_suspend() during freeze for USB3 devices. 
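As a sketch of the resulting policy (the helper name is hypothetical; in the driver this is an inline condition in generic_suspend()): the FREEZE/PRETHAW shortcut is only taken for USB 1.x/2.0 devices, while SuperSpeed devices always go through usb_port_suspend() so their hub port is selectively suspended.

#include <linux/pm.h>
#include <linux/usb.h>

static bool skip_freeze_suspend(struct usb_device *udev, pm_message_t msg)
{
	/* Root hubs are handled separately via hcd_bus_suspend(). */
	return (msg.event == PM_EVENT_FREEZE || msg.event == PM_EVENT_PRETHAW) &&
	       udev->speed < USB_SPEED_SUPER;
}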
Signed-off-by: Zhengjun Xing Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/generic.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/usb/core/generic.c b/drivers/usb/core/generic.c index a05431a69..f096c82d4 100644 --- a/drivers/usb/core/generic.c +++ b/drivers/usb/core/generic.c @@ -212,8 +212,13 @@ static int generic_suspend(struct usb_device *udev, pm_message_t msg) if (!udev->parent) rc = hcd_bus_suspend(udev, msg); - /* Non-root devices don't need to do anything for FREEZE or PRETHAW */ - else if (msg.event == PM_EVENT_FREEZE || msg.event == PM_EVENT_PRETHAW) + /* + * Non-root USB2 devices don't need to do anything for FREEZE + * or PRETHAW. USB3 devices don't support global suspend and + * needs to be selectively suspended. + */ + else if ((msg.event == PM_EVENT_FREEZE || msg.event == PM_EVENT_PRETHAW) + && (udev->speed < USB_SPEED_SUPER)) rc = 0; else rc = usb_port_suspend(udev, msg); From e64936054ea85feae77257972010a43794104aaf Mon Sep 17 00:00:00 2001 From: Aaron Ma Date: Mon, 8 Jan 2018 10:41:40 +0800 Subject: [PATCH 038/146] HID: i2c-hid: fix size check and type usage commit ac75a041048b8c1f7418e27621ca5efda8571043 upstream. When convert char array with signed int, if the inbuf[x] is negative then upper bits will be set to 1. Fix this by using u8 instead of char. ret_size has to be at least 3, hid_input_report use it after minus 2 bytes. Cc: stable@vger.kernel.org Signed-off-by: Aaron Ma Signed-off-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- drivers/hid/i2c-hid/i2c-hid.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/hid/i2c-hid/i2c-hid.c b/drivers/hid/i2c-hid/i2c-hid.c index 312aa1e33..4c3ed078c 100644 --- a/drivers/hid/i2c-hid/i2c-hid.c +++ b/drivers/hid/i2c-hid/i2c-hid.c @@ -137,10 +137,10 @@ struct i2c_hid { * register of the HID * descriptor. */ unsigned int bufsize; /* i2c buffer size */ - char *inbuf; /* Input buffer */ - char *rawbuf; /* Raw Input buffer */ - char *cmdbuf; /* Command buffer */ - char *argsbuf; /* Command arguments buffer */ + u8 *inbuf; /* Input buffer */ + u8 *rawbuf; /* Raw Input buffer */ + u8 *cmdbuf; /* Command buffer */ + u8 *argsbuf; /* Command arguments buffer */ unsigned long flags; /* device flags */ @@ -387,7 +387,8 @@ static int i2c_hid_hwreset(struct i2c_client *client) static void i2c_hid_get_input(struct i2c_hid *ihid) { - int ret, ret_size; + int ret; + u32 ret_size; int size = le16_to_cpu(ihid->hdesc.wMaxInputLength); if (size > ihid->bufsize) @@ -412,7 +413,7 @@ static void i2c_hid_get_input(struct i2c_hid *ihid) return; } - if (ret_size > size) { + if ((ret_size > size) || (ret_size <= 2)) { dev_err(&ihid->client->dev, "%s: incomplete report (%d/%d)\n", __func__, size, ret_size); return; From 22eb91f311d3c08f1d453b8559dc8f0604a24506 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 27 Mar 2018 01:02:33 +1000 Subject: [PATCH 039/146] powerpc/powernv: Handle unknown OPAL errors in opal_nvram_write() commit 741de617661794246f84a21a02fc5e327bffc9ad upstream. opal_nvram_write currently just assumes success if it encounters an error other than OPAL_BUSY or OPAL_BUSY_EVENT. Have it return -EIO on other errors instead. 
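Taken together with the OPAL_BUSY retry-loop patches later in this series, the write path ends up following the standard shape below, shown as a fragment of opal_nvram_write() that combines this change with the msleep() added a few patches further on:

	s64 rc = OPAL_BUSY;

	while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
		rc = opal_write_nvram(__pa(buf), count, off);
		if (rc == OPAL_BUSY_EVENT) {
			msleep(OPAL_BUSY_DELAY_MS);
			opal_poll_events(NULL);
		} else if (rc == OPAL_BUSY) {
			msleep(OPAL_BUSY_DELAY_MS);
		}
	}

	if (rc)			/* anything other than OPAL_SUCCESS */
		return -EIO;

	*index += count;
	return count;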
Fixes: 628daa8d5abf ("powerpc/powernv: Add RTC and NVRAM support plus RTAS fallbacks") Cc: stable@vger.kernel.org # v3.2+ Signed-off-by: Nicholas Piggin Reviewed-by: Vasant Hegde Acked-by: Stewart Smith Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/platforms/powernv/opal-nvram.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/powerpc/platforms/powernv/opal-nvram.c b/arch/powerpc/platforms/powernv/opal-nvram.c index 9db4398de..ba2ff06a2 100644 --- a/arch/powerpc/platforms/powernv/opal-nvram.c +++ b/arch/powerpc/platforms/powernv/opal-nvram.c @@ -59,6 +59,10 @@ static ssize_t opal_nvram_write(char *buf, size_t count, loff_t *index) if (rc == OPAL_BUSY_EVENT) opal_poll_events(NULL); } + + if (rc) + return -EIO; + *index += count; return count; } From 70670c0d90794bdd4424e516f7f67a9ba158df5e Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 22 Mar 2018 20:41:46 +1000 Subject: [PATCH 040/146] powerpc/64: Fix smp_wmb barrier definition use use lwsync consistently commit 0bfdf598900fd62869659f360d3387ed80eb71cf upstream. asm/barrier.h is not always included after asm/synch.h, which meant it was missing __SUBARCH_HAS_LWSYNC, so in some files smp_wmb() would be eieio when it should be lwsync. kernel/time/hrtimer.c is one case. __SUBARCH_HAS_LWSYNC is only used in one place, so just fold it in to where it's used. Previously with my small simulator config, 377 instances of eieio in the tree. After this patch there are 55. Fixes: 46d075be585e ("powerpc: Optimise smp_wmb") Cc: stable@vger.kernel.org # v2.6.29+ Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/include/asm/barrier.h | 3 ++- arch/powerpc/include/asm/synch.h | 4 ---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h index 0eca6efc0..b9e16855a 100644 --- a/arch/powerpc/include/asm/barrier.h +++ b/arch/powerpc/include/asm/barrier.h @@ -36,7 +36,8 @@ #define smp_store_mb(var, value) do { WRITE_ONCE(var, value); mb(); } while (0) -#ifdef __SUBARCH_HAS_LWSYNC +/* The sub-arch has lwsync */ +#if defined(__powerpc64__) || defined(CONFIG_PPC_E500MC) # define SMPWMB LWSYNC #else # define SMPWMB eieio diff --git a/arch/powerpc/include/asm/synch.h b/arch/powerpc/include/asm/synch.h index c50868681..e8d6a842f 100644 --- a/arch/powerpc/include/asm/synch.h +++ b/arch/powerpc/include/asm/synch.h @@ -5,10 +5,6 @@ #include #include -#if defined(__powerpc64__) || defined(CONFIG_PPC_E500MC) -#define __SUBARCH_HAS_LWSYNC -#endif - #ifndef __ASSEMBLY__ extern unsigned int __start___lwsync_fixup, __stop___lwsync_fixup; extern void do_lwsync_fixups(unsigned long value, void *fixup_start, From 8878306c7cc08f7ecf952f012039f31abf52d5ca Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 10 Apr 2018 21:49:31 +1000 Subject: [PATCH 041/146] powerpc/powernv: define a standard delay for OPAL_BUSY type retry loops commit 34dd25de9fe3f60bfdb31b473bf04b28262d0896 upstream. This is the start of an effort to tidy up and standardise all the delays. Existing loops have a range of delay/sleep periods from 1ms to 20ms, and some have no delay. They all loop forever except rtc, which times out after 10 retries, and that uses 10ms delays. So use 10ms as our standard delay. The OPAL maintainer agrees 10ms is a reasonable starting point. The idea is to use the same recipe everywhere, once this is proven to work then it will be documented as an OPAL API standard. 
Then both firmware and OS can agree, and if a particular call needs something else, then that can be documented with reasoning. This is not the end-all of this effort, it's just a relatively easy change that fixes some existing high latency delays. There should be provision for standardising timeouts and/or interruptible loops where possible, so non-fatal firmware errors don't cause hangs. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Cc: Nathan Chancellor Cc: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/include/asm/opal.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index 07a99e638..bab346111 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -21,6 +21,9 @@ /* We calculate number of sg entries based on PAGE_SIZE */ #define SG_ENTRIES_PER_NODE ((PAGE_SIZE - 16) / sizeof(struct opal_sg_entry)) +/* Default time to sleep or delay between OPAL_BUSY/OPAL_BUSY_EVENT loops */ +#define OPAL_BUSY_DELAY_MS 10 + /* /sys/firmware/opal */ extern struct kobject *opal_kobj; From 03248f779cb36bce332a509327b7476556f71888 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 10 Apr 2018 21:49:33 +1000 Subject: [PATCH 042/146] powerpc/powernv: Fix OPAL NVRAM driver OPAL_BUSY loops commit 3b8070335f751aac9f1526ae2e012e6f5b8b0f21 upstream. The OPAL NVRAM driver does not sleep in case it gets OPAL_BUSY or OPAL_BUSY_EVENT from firmware, which causes large scheduling latencies, and various lockup errors to trigger (again, BMC reboot can cause it). Fix this by converting it to the standard form OPAL_BUSY loop that sleeps. Fixes: 628daa8d5abf ("powerpc/powernv: Add RTC and NVRAM support plus RTAS fallbacks") Depends-on: 34dd25de9fe3 ("powerpc/powernv: define a standard delay for OPAL_BUSY type retry loops") Cc: stable@vger.kernel.org # v3.2+ Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/platforms/powernv/opal-nvram.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/powernv/opal-nvram.c b/arch/powerpc/platforms/powernv/opal-nvram.c index ba2ff06a2..1bceb95f4 100644 --- a/arch/powerpc/platforms/powernv/opal-nvram.c +++ b/arch/powerpc/platforms/powernv/opal-nvram.c @@ -11,6 +11,7 @@ #define DEBUG +#include #include #include #include @@ -56,8 +57,12 @@ static ssize_t opal_nvram_write(char *buf, size_t count, loff_t *index) while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { rc = opal_write_nvram(__pa(buf), count, off); - if (rc == OPAL_BUSY_EVENT) + if (rc == OPAL_BUSY_EVENT) { + msleep(OPAL_BUSY_DELAY_MS); opal_poll_events(NULL); + } else if (rc == OPAL_BUSY) { + msleep(OPAL_BUSY_DELAY_MS); + } } if (rc) From f9a0ece6149c817bf9d75def44e90a6013821fbf Mon Sep 17 00:00:00 2001 From: Aaron Ma Date: Sat, 3 Feb 2018 23:57:15 +0800 Subject: [PATCH 043/146] HID: Fix hid_report_len usage commit 3064a03b94e60388f0955fcc29f3e8a978d28f75 upstream. Follow the change of return type u32 of hid_report_len, fix all the types of variables those get the return value of hid_report_len to u32, and all other code already uses u32. 
Cc: stable@vger.kernel.org Signed-off-by: Aaron Ma Signed-off-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- drivers/hid/hid-input.c | 3 ++- drivers/hid/hid-multitouch.c | 5 +++-- drivers/hid/hid-rmi.c | 4 ++-- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c index 53e54855c..8d74e691a 100644 --- a/drivers/hid/hid-input.c +++ b/drivers/hid/hid-input.c @@ -1258,7 +1258,8 @@ static void hidinput_led_worker(struct work_struct *work) led_work); struct hid_field *field; struct hid_report *report; - int len, ret; + int ret; + u32 len; __u8 *buf; field = hidinput_get_led_field(hid); diff --git a/drivers/hid/hid-multitouch.c b/drivers/hid/hid-multitouch.c index f62a9d660..9de379c1b 100644 --- a/drivers/hid/hid-multitouch.c +++ b/drivers/hid/hid-multitouch.c @@ -314,7 +314,8 @@ static struct attribute_group mt_attribute_group = { static void mt_get_feature(struct hid_device *hdev, struct hid_report *report) { struct mt_device *td = hid_get_drvdata(hdev); - int ret, size = hid_report_len(report); + int ret; + u32 size = hid_report_len(report); u8 *buf; /* @@ -919,7 +920,7 @@ static void mt_set_input_mode(struct hid_device *hdev) struct hid_report_enum *re; struct mt_class *cls = &td->mtclass; char *buf; - int report_len; + u32 report_len; if (td->inputmode < 0) return; diff --git a/drivers/hid/hid-rmi.c b/drivers/hid/hid-rmi.c index 67cd059a8..41a4a2af9 100644 --- a/drivers/hid/hid-rmi.c +++ b/drivers/hid/hid-rmi.c @@ -110,8 +110,8 @@ struct rmi_data { u8 *writeReport; u8 *readReport; - int input_report_size; - int output_report_size; + u32 input_report_size; + u32 output_report_size; unsigned long flags; From 53ce3f17431b2e8f53d4d76d968a69247ab83f00 Mon Sep 17 00:00:00 2001 From: Aaron Ma Date: Mon, 8 Jan 2018 10:41:41 +0800 Subject: [PATCH 044/146] HID: core: Fix size as type u32 commit 6de0b13cc0b4ba10e98a9263d7a83b940720b77a upstream. When size is negative, calling memset will make segment fault. Declare the size as type u32 to keep memset safe. size in struct hid_report is unsigned, fix return type of hid_report_len to u32. 
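Why a signed length is dangerous here: memset() takes a size_t, so a negative int is converted to a huge unsigned value before the write starts. A minimal userspace demonstration (it faults by design):

#include <string.h>

int main(void)
{
	char buf[64];
	int size = -1;			/* e.g. a length derived from corrupt device data */

	/* -1 becomes SIZE_MAX after the implicit conversion to size_t,
	 * so this attempts to zero nearly the whole address space. */
	memset(buf, 0, size);
	return 0;
}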
Cc: stable@vger.kernel.org Signed-off-by: Aaron Ma Signed-off-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- drivers/hid/hid-core.c | 10 +++++----- include/linux/hid.h | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index 52fc0fdd3..9715c783b 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -1331,7 +1331,7 @@ u8 *hid_alloc_report_buf(struct hid_report *report, gfp_t flags) * of implement() working on 8 byte chunks */ - int len = hid_report_len(report) + 7; + u32 len = hid_report_len(report) + 7; return kmalloc(len, flags); } @@ -1396,7 +1396,7 @@ void __hid_request(struct hid_device *hid, struct hid_report *report, { char *buf; int ret; - int len; + u32 len; buf = hid_alloc_report_buf(report, GFP_KERNEL); if (!buf) @@ -1422,14 +1422,14 @@ void __hid_request(struct hid_device *hid, struct hid_report *report, } EXPORT_SYMBOL_GPL(__hid_request); -int hid_report_raw_event(struct hid_device *hid, int type, u8 *data, int size, +int hid_report_raw_event(struct hid_device *hid, int type, u8 *data, u32 size, int interrupt) { struct hid_report_enum *report_enum = hid->report_enum + type; struct hid_report *report; struct hid_driver *hdrv; unsigned int a; - int rsize, csize = size; + u32 rsize, csize = size; u8 *cdata = data; int ret = 0; @@ -1487,7 +1487,7 @@ EXPORT_SYMBOL_GPL(hid_report_raw_event); * * This is data entry for lower layers. */ -int hid_input_report(struct hid_device *hid, int type, u8 *data, int size, int interrupt) +int hid_input_report(struct hid_device *hid, int type, u8 *data, u32 size, int interrupt) { struct hid_report_enum *report_enum; struct hid_driver *hdrv; diff --git a/include/linux/hid.h b/include/linux/hid.h index 698f1fc8b..7127afa03 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -796,7 +796,7 @@ extern int hidinput_connect(struct hid_device *hid, unsigned int force); extern void hidinput_disconnect(struct hid_device *); int hid_set_field(struct hid_field *, unsigned, __s32); -int hid_input_report(struct hid_device *, int type, u8 *, int, int); +int hid_input_report(struct hid_device *, int type, u8 *, u32, int); int hidinput_find_field(struct hid_device *hid, unsigned int type, unsigned int code, struct hid_field **field); struct hid_field *hidinput_get_led_field(struct hid_device *hid); unsigned int hidinput_count_leds(struct hid_device *hid); @@ -1101,13 +1101,13 @@ static inline void hid_hw_wait(struct hid_device *hdev) * * @report: the report we want to know the length */ -static inline int hid_report_len(struct hid_report *report) +static inline u32 hid_report_len(struct hid_report *report) { /* equivalent to DIV_ROUND_UP(report->size, 8) + !!(report->id > 0) */ return ((report->size - 1) >> 3) + 1 + (report->id > 0); } -int hid_report_raw_event(struct hid_device *hid, int type, u8 *data, int size, +int hid_report_raw_event(struct hid_device *hid, int type, u8 *data, u32 size, int interrupt); /* HID quirks API */ From 99081e6dbaadedd43e2a6ce7320c32b3247ac53f Mon Sep 17 00:00:00 2001 From: James Kelly Date: Mon, 19 Mar 2018 21:29:50 +1100 Subject: [PATCH 045/146] ASoC: ssm2602: Replace reg_default_raw with reg_default commit a01df75ce737951ad13a08d101306e88c3f57cb2 upstream. SSM2602 driver is broken on recent kernels (at least since 4.9). User space applications such as amixer or alsamixer get EIO when attempting to access codec controls via the relevant IOCTLs. 
Root cause of these failures is the regcache_hw_init function in drivers/base/regmap/regcache.c, which prevents regmap cache initalization from the reg_defaults_raw element of the regmap_config structure when registers are write only. It also disables the regmap cache entirely when all registers are write only or volatile as is the case for the SSM2602 driver. Using the reg_defaults element of the regmap_config structure rather than the reg_defaults_raw element to initalize the regmap cache avoids the logic in the regcache_hw_init function entirely. It also makes this driver consistent with other ASoC codec drivers, as this driver was the ONLY codec driver that used the reg_defaults_raw element to initalize the cache. Tested on Digilent Zybo Z7 development board which has a SSM2603 codec chip connected to a Xilinx Zynq SoC. Signed-off-by: James Kelly Signed-off-by: Mark Brown Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- sound/soc/codecs/ssm2602.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/sound/soc/codecs/ssm2602.c b/sound/soc/codecs/ssm2602.c index 4452fea0b..bd4998f57 100644 --- a/sound/soc/codecs/ssm2602.c +++ b/sound/soc/codecs/ssm2602.c @@ -54,10 +54,17 @@ struct ssm2602_priv { * using 2 wire for device control, so we cache them instead. * There is no point in caching the reset register */ -static const u16 ssm2602_reg[SSM2602_CACHEREGNUM] = { - 0x0097, 0x0097, 0x0079, 0x0079, - 0x000a, 0x0008, 0x009f, 0x000a, - 0x0000, 0x0000 +static const struct reg_default ssm2602_reg[SSM2602_CACHEREGNUM] = { + { .reg = 0x00, .def = 0x0097 }, + { .reg = 0x01, .def = 0x0097 }, + { .reg = 0x02, .def = 0x0079 }, + { .reg = 0x03, .def = 0x0079 }, + { .reg = 0x04, .def = 0x000a }, + { .reg = 0x05, .def = 0x0008 }, + { .reg = 0x06, .def = 0x009f }, + { .reg = 0x07, .def = 0x000a }, + { .reg = 0x08, .def = 0x0000 }, + { .reg = 0x09, .def = 0x0000 } }; @@ -618,8 +625,8 @@ const struct regmap_config ssm2602_regmap_config = { .volatile_reg = ssm2602_register_volatile, .cache_type = REGCACHE_RBTREE, - .reg_defaults_raw = ssm2602_reg, - .num_reg_defaults_raw = ARRAY_SIZE(ssm2602_reg), + .reg_defaults = ssm2602_reg, + .num_reg_defaults = ARRAY_SIZE(ssm2602_reg), }; EXPORT_SYMBOL_GPL(ssm2602_regmap_config); From 7c990b1c4f7cdb1379281fc50bccc44ec649e653 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Tue, 19 Dec 2017 12:44:56 +0300 Subject: [PATCH 046/146] thunderbolt: Resume control channel after hibernation image is created commit f2a659f7d8d5da803836583aa16df06bdf324252 upstream. The driver misses implementation of PM hook that undoes what ->freeze_noirq() does after the hibernation image is created. This means the control channel is not resumed properly and the Thunderbolt bus becomes useless in later stages of hibernation (when the image is stored or if the operation fails). Fix this by pointing ->thaw_noirq to driver nhi_resume_noirq(). This makes sure the control channel is resumed properly. Fixes: 23dd5bb49d98 ("thunderbolt: Add suspend/hibernate support") Signed-off-by: Mika Westerberg Reviewed-by: Andy Shevchenko Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/thunderbolt/nhi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/thunderbolt/nhi.c b/drivers/thunderbolt/nhi.c index 20a41f7de..6713fd195 100644 --- a/drivers/thunderbolt/nhi.c +++ b/drivers/thunderbolt/nhi.c @@ -627,6 +627,7 @@ static const struct dev_pm_ops nhi_pm_ops = { * we just disable hotplug, the * pci-tunnels stay alive. 
*/ + .thaw_noirq = nhi_resume_noirq, .restore_noirq = nhi_resume_noirq, }; From 98d2bc57a151ed05e54efdafad8c24e20ca63101 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 19 Feb 2018 12:22:53 -0500 Subject: [PATCH 047/146] jbd2: if the journal is aborted then don't allow update of the log tail commit 85e0c4e89c1b864e763c4e3bb15d0b6d501ad5d9 upstream. This updates the jbd2 superblock unnecessarily, and on an abort we shouldn't truncate the log. Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/jbd2/journal.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 624a57a9c..4759df4eb 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -914,7 +914,7 @@ int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) } /* - * This is a variaon of __jbd2_update_log_tail which checks for validity of + * This is a variation of __jbd2_update_log_tail which checks for validity of * provided log tail and locks j_checkpoint_mutex. So it is safe against races * with other threads updating log tail. */ @@ -1384,6 +1384,9 @@ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, journal_superblock_t *sb = journal->j_superblock; int ret; + if (is_journal_aborted(journal)) + return -EIO; + BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n", tail_block, tail_tid); From 37e99858694e2056ef775d025421cdb7c486813e Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 19 Feb 2018 14:16:47 -0500 Subject: [PATCH 048/146] ext4: don't update checksum of new initialized bitmaps commit 044e6e3d74a3d7103a0c8a9305dfd94d64000660 upstream. When reading the inode or block allocation bitmap, if the bitmap needs to be initialized, do not update the checksum in the block group descriptor. That's because we're not set up to journal those changes. Instead, just set the verified bit on the bitmap block, so that it's not necessary to validate the checksum. When a block or inode allocation actually happens, at that point the checksum will be calculated, and update of the bg descriptor block will be properly journalled. 
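The checksum work described above is deferred to the allocation path, which always runs under a journal handle. The sketch below is illustrative only (the function name and exact call sequence are made up, not ext4's real allocator); it shows the pattern the commit relies on: recompute the bitmap and group descriptor checksums only once the change is covered by a running handle, so the update is journalled together with the allocation.

static int example_alloc_inode_bit(handle_t *handle, struct super_block *sb,
				   ext4_group_t group, struct buffer_head *bitmap_bh,
				   struct ext4_group_desc *gdp, unsigned int bit)
{
	int err;

	/* The bitmap buffer joins the running transaction first. */
	err = ext4_journal_get_write_access(handle, bitmap_bh);
	if (err)
		return err;

	ext4_lock_group(sb, group);
	ext4_set_bit(bit, bitmap_bh->b_data);
	/* Now it is safe to refresh the checksums: the change is journalled. */
	ext4_inode_bitmap_csum_set(sb, group, gdp, bitmap_bh,
				   EXT4_INODES_PER_GROUP(sb) / 8);
	ext4_group_desc_csum_set(sb, group, gdp);
	ext4_unlock_group(sb, group);

	return ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
}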
Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/ext4/balloc.c | 3 +-- fs/ext4/ialloc.c | 47 +++-------------------------------------------- 2 files changed, 4 insertions(+), 46 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index f97110461..78c51ce91 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -242,8 +242,6 @@ static int ext4_init_block_bitmap(struct super_block *sb, */ ext4_mark_bitmap_end(num_clusters_in_group(sb, block_group), sb->s_blocksize * 8, bh->b_data); - ext4_block_bitmap_csum_set(sb, block_group, gdp, bh); - ext4_group_desc_csum_set(sb, block_group, gdp); return 0; } @@ -447,6 +445,7 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) err = ext4_init_block_bitmap(sb, bh, block_group, desc); set_bitmap_uptodate(bh); set_buffer_uptodate(bh); + set_buffer_verified(bh); ext4_unlock_group(sb, block_group); unlock_buffer(bh); if (err) { diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 5388207d2..e10c12f59 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -63,44 +63,6 @@ void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap) memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3); } -/* Initializes an uninitialized inode bitmap */ -static int ext4_init_inode_bitmap(struct super_block *sb, - struct buffer_head *bh, - ext4_group_t block_group, - struct ext4_group_desc *gdp) -{ - struct ext4_group_info *grp; - struct ext4_sb_info *sbi = EXT4_SB(sb); - J_ASSERT_BH(bh, buffer_locked(bh)); - - /* If checksum is bad mark all blocks and inodes use to prevent - * allocation, essentially implementing a per-group read-only flag. */ - if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { - grp = ext4_get_group_info(sb, block_group); - if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) - percpu_counter_sub(&sbi->s_freeclusters_counter, - grp->bb_free); - set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); - if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { - int count; - count = ext4_free_inodes_count(sb, gdp); - percpu_counter_sub(&sbi->s_freeinodes_counter, - count); - } - set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); - return -EFSBADCRC; - } - - memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8); - ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, - bh->b_data); - ext4_inode_bitmap_csum_set(sb, block_group, gdp, bh, - EXT4_INODES_PER_GROUP(sb) / 8); - ext4_group_desc_csum_set(sb, block_group, gdp); - - return 0; -} - void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate) { if (uptodate) { @@ -184,17 +146,14 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) ext4_lock_group(sb, block_group); if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { - err = ext4_init_inode_bitmap(sb, bh, block_group, desc); + memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8); + ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), + sb->s_blocksize * 8, bh->b_data); set_bitmap_uptodate(bh); set_buffer_uptodate(bh); set_buffer_verified(bh); ext4_unlock_group(sb, block_group); unlock_buffer(bh); - if (err) { - ext4_error(sb, "Failed to init inode bitmap for group " - "%u: %d", block_group, err); - goto out; - } return bh; } ext4_unlock_group(sb, block_group); From 3cb7f40ba5a0f8c1b15067dafc24c79404f8ba5d Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 29 Mar 2018 21:56:09 -0400 Subject: [PATCH 049/146] ext4: fail ext4_iget for root directory if unallocated commit 8e4b5eae5decd9dfe5a4ee369c22028f90ab4c44 
upstream. If the root directory has an i_links_count of zero, then when the file system is mounted, then when ext4_fill_super() notices the problem and tries to call iput() the root directory in the error return path, ext4_evict_inode() will try to free the inode on disk, before all of the file system structures are set up, and this will result in an OOPS caused by a NULL pointer dereference. This issue has been assigned CVE-2018-1092. https://bugzilla.kernel.org/show_bug.cgi?id=199179 https://bugzilla.redhat.com/show_bug.cgi?id=1560777 Reported-by: Wen Xu Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/ext4/inode.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d6d4235a4..ca9a7d98d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4273,6 +4273,12 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) goto bad_inode; raw_inode = ext4_raw_inode(&iloc); + if ((ino == EXT4_ROOT_INO) && (raw_inode->i_links_count == 0)) { + EXT4_ERROR_INODE(inode, "root inode unallocated"); + ret = -EFSCORRUPTED; + goto bad_inode; + } + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > From 3951dc40b83758ae6be303f48f10f0d06eb56ddf Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Tue, 3 Apr 2018 15:33:01 -0700 Subject: [PATCH 050/146] RDMA/ucma: Don't allow setting RDMA_OPTION_IB_PATH without an RDMA device commit 8435168d50e66fa5eae01852769d20a36f9e5e83 upstream. Check to make sure that ctx->cm_id->device is set before we use it. Otherwise userspace can trigger a NULL dereference by doing RDMA_USER_CM_CMD_SET_OPTION on an ID that is not bound to a device. Cc: Reported-by: Signed-off-by: Roland Dreier Signed-off-by: Jason Gunthorpe Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/core/ucma.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 960fcb613..ea3bc9bb1 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -1230,6 +1230,9 @@ static int ucma_set_ib_path(struct ucma_context *ctx, if (!optlen) return -EINVAL; + if (!ctx->cm_id->device) + return -EINVAL; + memset(&sa_path, 0, sizeof(sa_path)); ib_sa_unpack_path(path_data->path_rec, &sa_path); From 1cd391e2a7d75ddd83b9d9cbb9169830bd5af8e8 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 2 Apr 2018 22:41:43 +0200 Subject: [PATCH 051/146] ALSA: pcm: Fix UAF at PCM release via PCM timer access commit a820ccbe21e8ce8e86c39cd1d3bc8c7d1cbb949b upstream. The PCM runtime object is created and freed dynamically at PCM stream open / close time. This is tracked via substream->runtime, and it's cleared at snd_pcm_detach_substream(). The runtime object assignment is protected by PCM open_mutex, so for all PCM operations, it's safely handled. However, each PCM substream provides also an ALSA timer interface, and user-space can access to this while closing a PCM substream. This may eventually lead to a UAF, as snd_pcm_timer_resolution() tries to access the runtime while clearing it in other side. Fortunately, it's the only concurrent access from the PCM timer, and it merely reads runtime->timer_resolution field. So, we can avoid the race by reordering kfree() and wrapping the substream->runtime clearance with the corresponding timer lock. 
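The reader that the new locking protects is the PCM timer resolution callback named above. Roughly (a simplified rendition, not a verbatim copy of sound/core/pcm_timer.c), it looks like the sketch below; the timer core invokes it with substream->timer->lock held, which is why clearing substream->runtime under the same lock closes the race window.

static unsigned long example_pcm_timer_resolution(struct snd_timer *timer)
{
	struct snd_pcm_substream *substream = timer->private_data;

	/* Called under timer->lock; runtime is either fully valid or NULL. */
	return substream->runtime ? substream->runtime->timer_resolution : 0;
}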
Reported-by: syzbot+8e62ff4e07aa2ce87826@syzkaller.appspotmail.com Cc: Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/core/pcm.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/sound/core/pcm.c b/sound/core/pcm.c index 074363b63..6bda8f6c5 100644 --- a/sound/core/pcm.c +++ b/sound/core/pcm.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -1025,8 +1026,13 @@ void snd_pcm_detach_substream(struct snd_pcm_substream *substream) snd_free_pages((void*)runtime->control, PAGE_ALIGN(sizeof(struct snd_pcm_mmap_control))); kfree(runtime->hw_constraints.rules); - kfree(runtime); + /* Avoid concurrent access to runtime via PCM timer interface */ + if (substream->timer) + spin_lock_irq(&substream->timer->lock); substream->runtime = NULL; + if (substream->timer) + spin_unlock_irq(&substream->timer->lock); + kfree(runtime); put_pid(substream->pid); substream->pid = NULL; substream->pstr->substream_opened--; From fd86611b3a988201b82688988e43b5063a3df407 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 23 Feb 2018 14:09:24 -0800 Subject: [PATCH 052/146] IB/srp: Fix srp_abort() commit e68088e78d82920632eba112b968e49d588d02a2 upstream. Before commit e494f6a72839 ("[SCSI] improved eh timeout handler") it did not really matter whether or not abort handlers like srp_abort() called .scsi_done() when returning another value than SUCCESS. Since that commit however this matters. Hence only call .scsi_done() when returning SUCCESS. Signed-off-by: Bart Van Assche Cc: stable@vger.kernel.org Signed-off-by: Jason Gunthorpe Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/ulp/srp/ib_srp.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 9a99cee26..ba51debef 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -2581,9 +2581,11 @@ static int srp_abort(struct scsi_cmnd *scmnd) ret = FAST_IO_FAIL; else ret = FAILED; - srp_free_req(ch, req, scmnd, 0); - scmnd->result = DID_ABORT << 16; - scmnd->scsi_done(scmnd); + if (ret == SUCCESS) { + srp_free_req(ch, req, scmnd, 0); + scmnd->result = DID_ABORT << 16; + scmnd->scsi_done(scmnd); + } return ret; } From 3fe0ee0596d5407f707bcaeae19e603821ccbb6b Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 12 Feb 2018 09:50:25 -0800 Subject: [PATCH 053/146] IB/srp: Fix completion vector assignment algorithm commit 3a148896b24adf8688dc0c59af54531931677a40 upstream. Ensure that cv_end is equal to ibdev->num_comp_vectors for the NUMA node with the highest index. This patch improves spreading of RDMA channels over completion vectors and thereby improves performance, especially on systems with only a single NUMA node. This patch drops support for the comp_vector login parameter by ignoring the value of that parameter since I have not found a good way to combine support for that parameter and automatic spreading of RDMA channels over completion vectors. 
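The new cv_start/cv_end formula is easy to check numerically. The stand-alone demo below (node and vector counts are made-up example values) shows that the half-open ranges tile [0, num_comp_vectors) and that the node with the highest index always ends exactly at num_comp_vectors:

#include <stdio.h>

int main(void)
{
	const int num_nodes = 2;		/* example NUMA node count */
	const int num_comp_vectors = 8;		/* example per-device count */

	for (int node_idx = 0; node_idx < num_nodes; node_idx++) {
		int cv_start = node_idx * num_comp_vectors / num_nodes;
		int cv_end = (node_idx + 1) * num_comp_vectors / num_nodes;

		printf("node %d -> completion vectors [%d, %d)\n",
		       node_idx, cv_start, cv_end);
	}
	/* Prints [0, 4) and [4, 8): every vector is covered exactly once. */
	return 0;
}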
Fixes: d92c0da71a35 ("IB/srp: Add multichannel support") Reported-by: Alexander Schmid Signed-off-by: Bart Van Assche Cc: Alexander Schmid Cc: stable@vger.kernel.org Signed-off-by: Jason Gunthorpe Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/ulp/srp/ib_srp.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index ba51debef..4fd289261 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -3311,12 +3311,10 @@ static ssize_t srp_create_target(struct device *dev, num_online_nodes()); const int ch_end = ((node_idx + 1) * target->ch_count / num_online_nodes()); - const int cv_start = (node_idx * ibdev->num_comp_vectors / - num_online_nodes() + target->comp_vector) - % ibdev->num_comp_vectors; - const int cv_end = ((node_idx + 1) * ibdev->num_comp_vectors / - num_online_nodes() + target->comp_vector) - % ibdev->num_comp_vectors; + const int cv_start = node_idx * ibdev->num_comp_vectors / + num_online_nodes(); + const int cv_end = (node_idx + 1) * ibdev->num_comp_vectors / + num_online_nodes(); int cpu_idx = 0; for_each_online_cpu(cpu) { From b325b61ce4db9d335097574e1dfb6f127d266de3 Mon Sep 17 00:00:00 2001 From: Maxime Jayat Date: Thu, 22 Feb 2018 12:39:55 +0100 Subject: [PATCH 054/146] dmaengine: at_xdmac: fix rare residue corruption commit c5637476bbf9bb86c7f0413b8f4822a73d8d2d07 upstream. Despite the efforts made to correctly read the NDA and CUBC registers, the order in which the registers are read could sometimes lead to an inconsistent state. Re-using the timeline from the comments, this following timing of registers reads could lead to reading NDA with value "@desc2" and CUBC with value "MAX desc1": INITD -------- ------------ |____________________| _______________________ _______________ NDA @desc2 \/ @desc3 _______________________/\_______________ __________ ___________ _______________ CUBC 0 \/ MAX desc1 \/ MAX desc2 __________/\___________/\_______________ | | | | Events:(1)(2) (3)(4) (1) check_nda = @desc2 (2) initd = 1 (3) cur_ubc = MAX desc1 (4) cur_nda = @desc2 This is allowed by the condition ((check_nda == cur_nda) && initd), despite cur_ubc and cur_nda being in the precise state we don't want. This error leads to incorrect residue computation. Fix it by inversing the order in which CUBC and INITD are read. This makes sure that NDA and CUBC are always read together either _before_ INITD goes to 0 or _after_ it is back at 1. The case where NDA is read before INITD is at 0 and CUBC is read after INITD is back at 1 will be rejected by check_nda and cur_nda being different. 
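Reading the corrected loop end to end makes the ordering argument easier to follow. The condensed sketch below combines the hunk from this patch with the acceptance test quoted above; it is illustrative rather than a verbatim copy of the driver:

	for (retry = 0; retry < AT_XDMAC_RESIDUE_MAX_RETRIES; retry++) {
		check_nda = at_xdmac_chan_read(atchan, AT_XDMAC_CNDA) & 0xfffffffc;
		rmb();
		cur_ubc = at_xdmac_chan_read(atchan, AT_XDMAC_CUBC);
		rmb();
		initd = !!(at_xdmac_chan_read(atchan, AT_XDMAC_CC) & AT_XDMAC_CC_INITD);
		rmb();
		cur_nda = at_xdmac_chan_read(atchan, AT_XDMAC_CNDA) & 0xfffffffc;
		rmb();

		/* Accept the snapshot only if NDA did not move and the
		 * descriptor fetch was complete for both NDA reads. */
		if ((check_nda == cur_nda) && initd)
			break;
	}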
Fixes: 53398f488821 ("dmaengine: at_xdmac: fix residue corruption") Cc: stable@vger.kernel.org Signed-off-by: Maxime Jayat Acked-by: Ludovic Desroches Signed-off-by: Vinod Koul Signed-off-by: Greg Kroah-Hartman --- drivers/dma/at_xdmac.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c index 66c073fc8..82a7c89ca 100644 --- a/drivers/dma/at_xdmac.c +++ b/drivers/dma/at_xdmac.c @@ -1473,10 +1473,10 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie, for (retry = 0; retry < AT_XDMAC_RESIDUE_MAX_RETRIES; retry++) { check_nda = at_xdmac_chan_read(atchan, AT_XDMAC_CNDA) & 0xfffffffc; rmb(); - initd = !!(at_xdmac_chan_read(atchan, AT_XDMAC_CC) & AT_XDMAC_CC_INITD); - rmb(); cur_ubc = at_xdmac_chan_read(atchan, AT_XDMAC_CUBC); rmb(); + initd = !!(at_xdmac_chan_read(atchan, AT_XDMAC_CC) & AT_XDMAC_CC_INITD); + rmb(); cur_nda = at_xdmac_chan_read(atchan, AT_XDMAC_CNDA) & 0xfffffffc; rmb(); From b853de548df114eb7fa00eae51ca1270ed502ff7 Mon Sep 17 00:00:00 2001 From: Krzysztof Mazur Date: Wed, 15 Nov 2017 11:12:39 +0100 Subject: [PATCH 055/146] um: Use POSIX ucontext_t instead of struct ucontext commit 4d1a535b8ec5e74b42dfd9dc809142653b2597f6 upstream. glibc 2.26 removed the 'struct ucontext' to "improve" POSIX compliance and break programs, including User Mode Linux. Fix User Mode Linux by using POSIX ucontext_t. This fixes: arch/um/os-Linux/signal.c: In function 'hard_handler': arch/um/os-Linux/signal.c:163:22: error: dereferencing pointer to incomplete type 'struct ucontext' mcontext_t *mc = &uc->uc_mcontext; arch/x86/um/stub_segv.c: In function 'stub_segv_handler': arch/x86/um/stub_segv.c:16:13: error: dereferencing pointer to incomplete type 'struct ucontext' &uc->uc_mcontext); Cc: stable@vger.kernel.org Signed-off-by: Krzysztof Mazur Signed-off-by: Richard Weinberger Signed-off-by: Greg Kroah-Hartman --- arch/um/os-Linux/signal.c | 2 +- arch/x86/um/stub_segv.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c index c211153ca..56648f4f8 100644 --- a/arch/um/os-Linux/signal.c +++ b/arch/um/os-Linux/signal.c @@ -140,7 +140,7 @@ static void (*handlers[_NSIG])(int sig, struct siginfo *si, mcontext_t *mc) = { static void hard_handler(int sig, siginfo_t *si, void *p) { - struct ucontext *uc = p; + ucontext_t *uc = p; mcontext_t *mc = &uc->uc_mcontext; unsigned long pending = 1UL << sig; diff --git a/arch/x86/um/stub_segv.c b/arch/x86/um/stub_segv.c index 1518d2805..fd6825537 100644 --- a/arch/x86/um/stub_segv.c +++ b/arch/x86/um/stub_segv.c @@ -10,7 +10,7 @@ void __attribute__ ((__section__ (".__syscall_stub"))) stub_segv_handler(int sig, siginfo_t *info, void *p) { - struct ucontext *uc = p; + ucontext_t *uc = p; GET_FAULTINFO_FROM_MC(*((struct faultinfo *) STUB_DATA), &uc->uc_mcontext); From d62773908d051f81626875d56f3f718cf51afeb5 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sat, 24 Feb 2018 13:42:27 +0800 Subject: [PATCH 056/146] iommu/vt-d: Fix a potential memory leak commit bbe4b3af9d9e3172fb9aa1f8dcdfaedcb381fc64 upstream. A memory block was allocated in intel_svm_bind_mm() but never freed in a failure path. This patch fixes this by free it to avoid memory leakage. 
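The rule the fix restores is the usual goto-based error unwind: every object allocated before the failure point needs a matching kfree() on the way out. A generic sketch (structure and helper names here are made up, not the driver's):

static int example_bind(void)
{
	struct example_svm *svm;
	struct example_sdev *sdev;
	int ret;

	svm = kzalloc(sizeof(*svm), GFP_KERNEL);
	if (!svm)
		return -ENOMEM;

	sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
	if (!sdev) {
		ret = -ENOMEM;
		goto free_svm;
	}

	ret = example_alloc_pasid(svm);		/* hypothetical helper */
	if (ret < 0)
		goto free_sdev;			/* both objects must be freed */

	return 0;

free_sdev:
	kfree(sdev);
free_svm:
	kfree(svm);
	return ret;
}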
Cc: Ashok Raj Cc: Jacob Pan Cc: # v4.4+ Signed-off-by: Lu Baolu Fixes: 2f26e0a9c9860 ('iommu/vt-d: Add basic SVM PASID support') Signed-off-by: Joerg Roedel Signed-off-by: Greg Kroah-Hartman --- drivers/iommu/intel-svm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index a7d516f97..10068a481 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -389,6 +389,7 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ pasid_max - 1, GFP_KERNEL); if (ret < 0) { kfree(svm); + kfree(sdev); goto out; } svm->pasid = ret; From 0a0521d8793ef75e17fee3ef9dbddd0a0319a953 Mon Sep 17 00:00:00 2001 From: Alex Smith Date: Wed, 28 Mar 2018 18:00:43 -0300 Subject: [PATCH 057/146] mmc: jz4740: Fix race condition in IRQ mask update commit a04f0017c22453613d5f423326b190c61e3b4f98 upstream. A spinlock is held while updating the internal copy of the IRQ mask, but not while writing it to the actual IMASK register. After the lock is released, an IRQ can occur before the IMASK register is written. If handling this IRQ causes the mask to be changed, when the handler returns back to the middle of the first mask update, a stale value will be written to the mask register. If this causes an IRQ to become unmasked that cannot have its status cleared by writing a 1 to it in the IREG register, e.g. the SDIO IRQ, then we can end up stuck with the same IRQ repeatedly being fired but not handled. Normally the MMC IRQ handler attempts to clear any unexpected IRQs by writing IREG, but for those that cannot be cleared in this way then the IRQ will just repeatedly fire. This was resulting in lockups after a while of using Wi-Fi on the CI20 (GitHub issue #19). Resolve by holding the spinlock until after the IMASK register has been updated. Cc: stable@vger.kernel.org Link: https://github.com/MIPS/CI20_linux/issues/19 Fixes: 61bfbdb85687 ("MMC: Add support for the controller on JZ4740 SoCs.") Tested-by: Mathieu Malaterre Signed-off-by: Alex Smith Signed-off-by: Ulf Hansson Signed-off-by: Greg Kroah-Hartman --- drivers/mmc/host/jz4740_mmc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/jz4740_mmc.c b/drivers/mmc/host/jz4740_mmc.c index 76e8bce6f..ad572a0f2 100644 --- a/drivers/mmc/host/jz4740_mmc.c +++ b/drivers/mmc/host/jz4740_mmc.c @@ -368,9 +368,9 @@ static void jz4740_mmc_set_irq_enabled(struct jz4740_mmc_host *host, host->irq_mask &= ~irq; else host->irq_mask |= irq; - spin_unlock_irqrestore(&host->lock, flags); writew(host->irq_mask, host->base + JZ_REG_MMC_IMASK); + spin_unlock_irqrestore(&host->lock, flags); } static void jz4740_mmc_clock_enable(struct jz4740_mmc_host *host, From 67fac592a546dae29de2edcdaa281dcd9ce39fe4 Mon Sep 17 00:00:00 2001 From: Ralph Sennhauser Date: Wed, 24 May 2017 16:58:52 +0200 Subject: [PATCH 058/146] clk: mvebu: armada-38x: add support for 1866MHz variants commit 9593f4f56cf5d1c443f66660a0c7f01de38f979d upstream. The Linksys WRT3200ACM CPU is clocked at 1866MHz. Add 1866MHz to the list of supported CPU frequencies. Also update multiplier and divisor for the l2clk and ddrclk. 
Noticed by the following warning: [ 0.000000] Selected CPU frequency (16) unsupported Signed-off-by: Ralph Sennhauser Reviewed-by: Gregory CLEMENT Signed-off-by: Stephen Boyd Signed-off-by: Greg Kroah-Hartman --- drivers/clk/mvebu/armada-38x.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/clk/mvebu/armada-38x.c b/drivers/clk/mvebu/armada-38x.c index 8bccf4ecd..394aa6f03 100644 --- a/drivers/clk/mvebu/armada-38x.c +++ b/drivers/clk/mvebu/armada-38x.c @@ -49,7 +49,8 @@ static const u32 armada_38x_cpu_frequencies[] __initconst = { 0, 0, 0, 0, 1066 * 1000 * 1000, 0, 0, 0, 1332 * 1000 * 1000, 0, 0, 0, - 1600 * 1000 * 1000, + 1600 * 1000 * 1000, 0, 0, 0, + 1866 * 1000 * 1000, }; static u32 __init armada_38x_get_cpu_freq(void __iomem *sar) @@ -79,7 +80,7 @@ static const int armada_38x_cpu_l2_ratios[32][2] __initconst = { {1, 2}, {0, 1}, {0, 1}, {0, 1}, {1, 2}, {0, 1}, {0, 1}, {0, 1}, {1, 2}, {0, 1}, {0, 1}, {0, 1}, - {0, 1}, {0, 1}, {0, 1}, {0, 1}, + {1, 2}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, @@ -90,7 +91,7 @@ static const int armada_38x_cpu_ddr_ratios[32][2] __initconst = { {1, 2}, {0, 1}, {0, 1}, {0, 1}, {1, 2}, {0, 1}, {0, 1}, {0, 1}, {1, 2}, {0, 1}, {0, 1}, {0, 1}, - {0, 1}, {0, 1}, {0, 1}, {0, 1}, + {1, 2}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, From d39fa1fefc5077dd86e2c3502b9aa497200e585f Mon Sep 17 00:00:00 2001 From: Richard Genoud Date: Tue, 13 Mar 2018 16:27:02 +0100 Subject: [PATCH 059/146] clk: mvebu: armada-38x: add support for missing clocks commit 6a4a4595804548e173f0763a0e7274a3521c59a9 upstream. Clearfog boards can come with a CPU clocked at 1600MHz (commercial) or 1333MHz (industrial). They have also some dip-switches to select a different clock (666, 800, 1066, 1200). The funny thing is that the recovery button is on the MPP34 fq selector. So, when booting an industrial board with this button down, the frequency 666MHz is selected (and the kernel didn't boot). This patch add all the missing clocks. The only mode I didn't test is 2GHz (uboot found 4294MHz instead :/ ). 
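The selector read from the SAR register simply indexes the CPU frequency table this patch fills in. The small user-space demo below mirrors that lookup with the table from the hunk further down (values in MHz); selector 0 is the Clearfog recovery-button case described above, which previously landed on an empty slot and hit the unsupported-frequency path:

#include <stdio.h>

static const unsigned int cpu_freq_mhz[] = {
	 666,    0,  800,    0,
	1066,    0, 1200,    0,
	1332,    0,    0,    0,
	1600,    0,    0,    0,
	1866,    0,    0, 2000,
};

int main(void)
{
	unsigned int sel = 0;	/* example SAR/dip-switch selector */

	if (sel >= sizeof(cpu_freq_mhz) / sizeof(cpu_freq_mhz[0]) ||
	    !cpu_freq_mhz[sel])
		printf("Selected CPU frequency (%u) unsupported\n", sel);
	else
		printf("CPU clock: %u MHz\n", cpu_freq_mhz[sel]);
	return 0;
}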
Fixes: 0e85aeced4d6 ("clk: mvebu: add clock support for Armada 380/385") Cc: # 3.16.x: 9593f4f56cf5: clk: mvebu: armada-38x: add support for 1866MHz variants Cc: # 3.16.x Signed-off-by: Richard Genoud Acked-by: Gregory CLEMENT Signed-off-by: Stephen Boyd Signed-off-by: Greg Kroah-Hartman --- drivers/clk/mvebu/armada-38x.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/clk/mvebu/armada-38x.c b/drivers/clk/mvebu/armada-38x.c index 394aa6f03..9ff4ea639 100644 --- a/drivers/clk/mvebu/armada-38x.c +++ b/drivers/clk/mvebu/armada-38x.c @@ -46,11 +46,11 @@ static u32 __init armada_38x_get_tclk_freq(void __iomem *sar) } static const u32 armada_38x_cpu_frequencies[] __initconst = { - 0, 0, 0, 0, - 1066 * 1000 * 1000, 0, 0, 0, + 666 * 1000 * 1000, 0, 800 * 1000 * 1000, 0, + 1066 * 1000 * 1000, 0, 1200 * 1000 * 1000, 0, 1332 * 1000 * 1000, 0, 0, 0, 1600 * 1000 * 1000, 0, 0, 0, - 1866 * 1000 * 1000, + 1866 * 1000 * 1000, 0, 0, 2000 * 1000 * 1000, }; static u32 __init armada_38x_get_cpu_freq(void __iomem *sar) @@ -76,11 +76,11 @@ static const struct coreclk_ratio armada_38x_coreclk_ratios[] __initconst = { }; static const int armada_38x_cpu_l2_ratios[32][2] __initconst = { - {0, 1}, {0, 1}, {0, 1}, {0, 1}, - {1, 2}, {0, 1}, {0, 1}, {0, 1}, - {1, 2}, {0, 1}, {0, 1}, {0, 1}, + {1, 2}, {0, 1}, {1, 2}, {0, 1}, + {1, 2}, {0, 1}, {1, 2}, {0, 1}, {1, 2}, {0, 1}, {0, 1}, {0, 1}, {1, 2}, {0, 1}, {0, 1}, {0, 1}, + {1, 2}, {0, 1}, {0, 1}, {1, 2}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, @@ -91,7 +91,7 @@ static const int armada_38x_cpu_ddr_ratios[32][2] __initconst = { {1, 2}, {0, 1}, {0, 1}, {0, 1}, {1, 2}, {0, 1}, {0, 1}, {0, 1}, {1, 2}, {0, 1}, {0, 1}, {0, 1}, - {1, 2}, {0, 1}, {0, 1}, {0, 1}, + {1, 2}, {0, 1}, {0, 1}, {7, 15}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, From cf75e6771dc9203f67478ffd1ecab2118d3c20f5 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Thu, 22 Mar 2018 10:11:30 +0100 Subject: [PATCH 060/146] clk: bcm2835: De-assert/assert PLL reset signal when appropriate commit 753872373b599384ac7df809aa61ea12d1c4d5d1 upstream. In order to enable a PLL, not only the PLL has to be powered up and locked, but you also have to de-assert the reset signal. The last part was missing. Add it so PLLs that were not enabled by the FW/bootloader can be enabled from Linux. 
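Put together, the enable path after this change powers the PLL up and takes it out of analog reset, waits for lock, and only then de-asserts the remaining reset via A2W_PLL_CTRL_PRST_DISABLE (the step this patch adds). The sketch below is a condensed, illustrative rendering using only the accessors and bit names visible in the hunks that follow; it omits the locking and busy-wait details of the real function:

static int example_pll_on(struct bcm2835_cprman *cprman,
			  const struct bcm2835_pll_data *data)
{
	/* Power the PLL up and release the analog reset. */
	cprman_write(cprman, data->a2w_ctrl_reg,
		     cprman_read(cprman, data->a2w_ctrl_reg) & ~A2W_PLL_CTRL_PWRDN);
	cprman_write(cprman, data->cm_ctrl_reg,
		     cprman_read(cprman, data->cm_ctrl_reg) & ~CM_PLL_ANARST);

	/* ... poll the lock status for this PLL until it locks (omitted) ... */

	/* The missing step: de-assert the PLL reset only after lock. */
	cprman_write(cprman, data->a2w_ctrl_reg,
		     cprman_read(cprman, data->a2w_ctrl_reg) |
		     A2W_PLL_CTRL_PRST_DISABLE);
	return 0;
}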
Fixes: 41691b8862e2 ("clk: bcm2835: Add support for programming the audio domain clocks") Cc: Signed-off-by: Boris Brezillon Reviewed-by: Eric Anholt Signed-off-by: Stephen Boyd Signed-off-by: Greg Kroah-Hartman --- drivers/clk/bcm/clk-bcm2835.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/clk/bcm/clk-bcm2835.c b/drivers/clk/bcm/clk-bcm2835.c index 7c4b1ffe8..d56ba46e6 100644 --- a/drivers/clk/bcm/clk-bcm2835.c +++ b/drivers/clk/bcm/clk-bcm2835.c @@ -891,9 +891,7 @@ static void bcm2835_pll_off(struct clk_hw *hw) const struct bcm2835_pll_data *data = pll->data; spin_lock(&cprman->regs_lock); - cprman_write(cprman, data->cm_ctrl_reg, - cprman_read(cprman, data->cm_ctrl_reg) | - CM_PLL_ANARST); + cprman_write(cprman, data->cm_ctrl_reg, CM_PLL_ANARST); cprman_write(cprman, data->a2w_ctrl_reg, cprman_read(cprman, data->a2w_ctrl_reg) | A2W_PLL_CTRL_PWRDN); @@ -929,6 +927,10 @@ static int bcm2835_pll_on(struct clk_hw *hw) cpu_relax(); } + cprman_write(cprman, data->a2w_ctrl_reg, + cprman_read(cprman, data->a2w_ctrl_reg) | + A2W_PLL_CTRL_PRST_DISABLE); + return 0; } From 87841ea7f761752bcecfd7e0106b380a04dc58a0 Mon Sep 17 00:00:00 2001 From: Mikhail Lappo Date: Fri, 2 Feb 2018 16:17:46 -0200 Subject: [PATCH 061/146] thermal: imx: Fix race condition in imx_thermal_probe() commit cf1ba1d73a33944d8c1a75370a35434bf146b8a7 upstream. When device boots with T > T_trip_1 and requests interrupt, the race condition takes place. The interrupt comes before THERMAL_DEVICE_ENABLED is set. This leads to an attempt to reading sensor value from irq and disabling the sensor, based on the data->mode field, which expected to be THERMAL_DEVICE_ENABLED, but still stays as THERMAL_DEVICE_DISABLED. Afher this issue sensor is never re-enabled, as the driver state is wrong. Fix this problem by setting the 'data' members prior to requesting the interrupts. Fixes: 37713a1e8e4c ("thermal: imx: implement thermal alarm interrupt handling") Cc: Signed-off-by: Mikhail Lappo Signed-off-by: Fabio Estevam Reviewed-by: Philipp Zabel Acked-by: Dong Aisheng Signed-off-by: Zhang Rui Signed-off-by: Greg Kroah-Hartman --- drivers/thermal/imx_thermal.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/thermal/imx_thermal.c b/drivers/thermal/imx_thermal.c index c5547bd71..6a8300108 100644 --- a/drivers/thermal/imx_thermal.c +++ b/drivers/thermal/imx_thermal.c @@ -589,6 +589,9 @@ static int imx_thermal_probe(struct platform_device *pdev) regmap_write(map, TEMPSENSE0 + REG_CLR, TEMPSENSE0_POWER_DOWN); regmap_write(map, TEMPSENSE0 + REG_SET, TEMPSENSE0_MEASURE_TEMP); + data->irq_enabled = true; + data->mode = THERMAL_DEVICE_ENABLED; + ret = devm_request_threaded_irq(&pdev->dev, data->irq, imx_thermal_alarm_irq, imx_thermal_alarm_irq_thread, 0, "imx_thermal", data); @@ -600,9 +603,6 @@ static int imx_thermal_probe(struct platform_device *pdev) return ret; } - data->irq_enabled = true; - data->mode = THERMAL_DEVICE_ENABLED; - return 0; } From b5d2de85035f14d1c5a9dfa3147fa535fe8dbfdb Mon Sep 17 00:00:00 2001 From: Igor Pylypiv Date: Tue, 6 Mar 2018 23:47:25 -0800 Subject: [PATCH 062/146] watchdog: f71808e_wdt: Fix WD_EN register read commit 977f6f68331f94bb72ad84ee96b7b87ce737d89d upstream. 
F71808FG_FLAG_WD_EN defines bit position, not a bitmask Signed-off-by: Igor Pylypiv Reviewed-by: Guenter Roeck Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/watchdog/f71808e_wdt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/watchdog/f71808e_wdt.c b/drivers/watchdog/f71808e_wdt.c index 016bd9355..aa93df583 100644 --- a/drivers/watchdog/f71808e_wdt.c +++ b/drivers/watchdog/f71808e_wdt.c @@ -450,7 +450,7 @@ static bool watchdog_is_running(void) is_running = (superio_inb(watchdog.sioaddr, SIO_REG_ENABLE) & BIT(0)) && (superio_inb(watchdog.sioaddr, F71808FG_REG_WDT_CONF) - & F71808FG_FLAG_WD_EN); + & BIT(F71808FG_FLAG_WD_EN)); superio_exit(watchdog.sioaddr); From 7f05fd4108f818f8eff8d5a7835714c497f1f097 Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Sat, 19 Dec 2015 15:23:13 +0100 Subject: [PATCH 063/146] ALSA: oss: consolidate kmalloc/memset 0 call to kzalloc commit 46325371b230cc66c743925c930a17e7d0b8211e upstream. This is an API consolidation only. The use of kmalloc + memset to 0 is equivalent to kzalloc. Signed-off-by: Nicholas Mc Guire Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/core/oss/pcm_oss.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sound/core/oss/pcm_oss.c b/sound/core/oss/pcm_oss.c index 6cd8aec14..672f3acfc 100644 --- a/sound/core/oss/pcm_oss.c +++ b/sound/core/oss/pcm_oss.c @@ -853,7 +853,7 @@ static int snd_pcm_oss_change_params(struct snd_pcm_substream *substream, return -EAGAIN; } else if (mutex_lock_interruptible(&runtime->oss.params_lock)) return -EINTR; - sw_params = kmalloc(sizeof(*sw_params), GFP_KERNEL); + sw_params = kzalloc(sizeof(*sw_params), GFP_KERNEL); params = kmalloc(sizeof(*params), GFP_KERNEL); sparams = kmalloc(sizeof(*sparams), GFP_KERNEL); if (!sw_params || !params || !sparams) { @@ -991,7 +991,6 @@ static int snd_pcm_oss_change_params(struct snd_pcm_substream *substream, goto failure; } - memset(sw_params, 0, sizeof(*sw_params)); if (runtime->oss.trigger) { sw_params->start_threshold = 1; } else { From ade7e3661ef582adabc111042bae830a1910ede3 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 9 Jan 2018 08:51:02 +0100 Subject: [PATCH 064/146] ALSA: pcm: Use ERESTARTSYS instead of EINTR in OSS emulation commit c64ed5dd9feba193c76eb460b451225ac2a0d87b upstream. Fix the last standing EINTR in the whole subsystem. Use more correct ERESTARTSYS for pending signals. Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/core/oss/pcm_oss.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/core/oss/pcm_oss.c b/sound/core/oss/pcm_oss.c index 672f3acfc..3e22f1412 100644 --- a/sound/core/oss/pcm_oss.c +++ b/sound/core/oss/pcm_oss.c @@ -852,7 +852,7 @@ static int snd_pcm_oss_change_params(struct snd_pcm_substream *substream, if (!(mutex_trylock(&runtime->oss.params_lock))) return -EAGAIN; } else if (mutex_lock_interruptible(&runtime->oss.params_lock)) - return -EINTR; + return -ERESTARTSYS; sw_params = kzalloc(sizeof(*sw_params), GFP_KERNEL); params = kmalloc(sizeof(*params), GFP_KERNEL); sparams = kmalloc(sizeof(*sparams), GFP_KERNEL); From 3f7ee12f1c85f61c11de927b65d6d2244831278a Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Thu, 22 Mar 2018 18:10:14 +0100 Subject: [PATCH 065/146] ALSA: pcm: Avoid potential races between OSS ioctls and read/write commit 02a5d6925cd34c3b774bdb8eefb057c40a30e870 upstream. 
Although we apply the params_lock mutex to the whole read and write operations as well as snd_pcm_oss_change_params(), we may still face some races. First off, the params_lock is taken inside the read and write loop. This is intentional for avoiding the too long locking, but it allows the in-between parameter change, which might lead to invalid pointers. We check the readiness of the stream and set up via snd_pcm_oss_make_ready() at the beginning of read and write, but it's called only once, by assuming that it remains ready in the rest. Second, many ioctls that may change the actual parameters (i.e. setting runtime->oss.params=1) aren't protected, hence they can be processed in a half-baked state. This patch is an attempt to plug these holes. The stream readiness check is moved inside the read/write inner loop, so that the stream is always set up in a proper state before further processing. Also, each ioctl that may change the parameter is wrapped with the params_lock for avoiding the races. The issues were triggered by syzkaller in a few different scenarios, particularly the one below appearing as GPF in loopback_pos_update. Reported-by: syzbot+c4227aec125487ec3efa@syzkaller.appspotmail.com Cc: Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/core/oss/pcm_oss.c | 134 +++++++++++++++++++++++++++++++-------- 1 file changed, 106 insertions(+), 28 deletions(-) diff --git a/sound/core/oss/pcm_oss.c b/sound/core/oss/pcm_oss.c index 3e22f1412..ff90bc4e9 100644 --- a/sound/core/oss/pcm_oss.c +++ b/sound/core/oss/pcm_oss.c @@ -833,8 +833,8 @@ static int choose_rate(struct snd_pcm_substream *substream, return snd_pcm_hw_param_near(substream, params, SNDRV_PCM_HW_PARAM_RATE, best_rate, NULL); } -static int snd_pcm_oss_change_params(struct snd_pcm_substream *substream, - bool trylock) +/* call with params_lock held */ +static int snd_pcm_oss_change_params_locked(struct snd_pcm_substream *substream) { struct snd_pcm_runtime *runtime = substream->runtime; struct snd_pcm_hw_params *params, *sparams; @@ -848,11 +848,8 @@ static int snd_pcm_oss_change_params(struct snd_pcm_substream *substream, struct snd_mask sformat_mask; struct snd_mask mask; - if (trylock) { - if (!(mutex_trylock(&runtime->oss.params_lock))) - return -EAGAIN; - } else if (mutex_lock_interruptible(&runtime->oss.params_lock)) - return -ERESTARTSYS; + if (!runtime->oss.params) + return 0; sw_params = kzalloc(sizeof(*sw_params), GFP_KERNEL); params = kmalloc(sizeof(*params), GFP_KERNEL); sparams = kmalloc(sizeof(*sparams), GFP_KERNEL); @@ -1078,6 +1075,23 @@ static int snd_pcm_oss_change_params(struct snd_pcm_substream *substream, kfree(sw_params); kfree(params); kfree(sparams); + return err; +} + +/* this one takes the lock by itself */ +static int snd_pcm_oss_change_params(struct snd_pcm_substream *substream, + bool trylock) +{ + struct snd_pcm_runtime *runtime = substream->runtime; + int err; + + if (trylock) { + if (!(mutex_trylock(&runtime->oss.params_lock))) + return -EAGAIN; + } else if (mutex_lock_interruptible(&runtime->oss.params_lock)) + return -ERESTARTSYS; + + err = snd_pcm_oss_change_params_locked(substream); mutex_unlock(&runtime->oss.params_lock); return err; } @@ -1106,11 +1120,14 @@ static int snd_pcm_oss_get_active_substream(struct snd_pcm_oss_file *pcm_oss_fil return 0; } +/* call with params_lock held */ static int snd_pcm_oss_prepare(struct snd_pcm_substream *substream) { int err; struct snd_pcm_runtime *runtime = substream->runtime; + if (!runtime->oss.prepare) + return 0; err = 
snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_PREPARE, NULL); if (err < 0) { pcm_dbg(substream->pcm, @@ -1130,14 +1147,35 @@ static int snd_pcm_oss_make_ready(struct snd_pcm_substream *substream) struct snd_pcm_runtime *runtime; int err; - if (substream == NULL) - return 0; runtime = substream->runtime; if (runtime->oss.params) { err = snd_pcm_oss_change_params(substream, false); if (err < 0) return err; } + if (runtime->oss.prepare) { + if (mutex_lock_interruptible(&runtime->oss.params_lock)) + return -ERESTARTSYS; + err = snd_pcm_oss_prepare(substream); + mutex_unlock(&runtime->oss.params_lock); + if (err < 0) + return err; + } + return 0; +} + +/* call with params_lock held */ +static int snd_pcm_oss_make_ready_locked(struct snd_pcm_substream *substream) +{ + struct snd_pcm_runtime *runtime; + int err; + + runtime = substream->runtime; + if (runtime->oss.params) { + err = snd_pcm_oss_change_params_locked(substream); + if (err < 0) + return err; + } if (runtime->oss.prepare) { err = snd_pcm_oss_prepare(substream); if (err < 0) @@ -1366,13 +1404,14 @@ static ssize_t snd_pcm_oss_write1(struct snd_pcm_substream *substream, const cha if (atomic_read(&substream->mmap_count)) return -ENXIO; - if ((tmp = snd_pcm_oss_make_ready(substream)) < 0) - return tmp; while (bytes > 0) { if (mutex_lock_interruptible(&runtime->oss.params_lock)) { tmp = -ERESTARTSYS; break; } + tmp = snd_pcm_oss_make_ready_locked(substream); + if (tmp < 0) + goto err; if (bytes < runtime->oss.period_bytes || runtime->oss.buffer_used > 0) { tmp = bytes; if (tmp + runtime->oss.buffer_used > runtime->oss.period_bytes) @@ -1473,13 +1512,14 @@ static ssize_t snd_pcm_oss_read1(struct snd_pcm_substream *substream, char __use if (atomic_read(&substream->mmap_count)) return -ENXIO; - if ((tmp = snd_pcm_oss_make_ready(substream)) < 0) - return tmp; while (bytes > 0) { if (mutex_lock_interruptible(&runtime->oss.params_lock)) { tmp = -ERESTARTSYS; break; } + tmp = snd_pcm_oss_make_ready_locked(substream); + if (tmp < 0) + goto err; if (bytes < runtime->oss.period_bytes || runtime->oss.buffer_used > 0) { if (runtime->oss.buffer_used == 0) { tmp = snd_pcm_oss_read2(substream, runtime->oss.buffer, runtime->oss.period_bytes, 1); @@ -1535,10 +1575,12 @@ static int snd_pcm_oss_reset(struct snd_pcm_oss_file *pcm_oss_file) continue; runtime = substream->runtime; snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_DROP, NULL); + mutex_lock(&runtime->oss.params_lock); runtime->oss.prepare = 1; runtime->oss.buffer_used = 0; runtime->oss.prev_hw_ptr_period = 0; runtime->oss.period_ptr = 0; + mutex_unlock(&runtime->oss.params_lock); } return 0; } @@ -1624,9 +1666,10 @@ static int snd_pcm_oss_sync(struct snd_pcm_oss_file *pcm_oss_file) goto __direct; if ((err = snd_pcm_oss_make_ready(substream)) < 0) return err; + if (mutex_lock_interruptible(&runtime->oss.params_lock)) + return -ERESTARTSYS; format = snd_pcm_oss_format_from(runtime->oss.format); width = snd_pcm_format_physical_width(format); - mutex_lock(&runtime->oss.params_lock); if (runtime->oss.buffer_used > 0) { #ifdef OSS_DEBUG pcm_dbg(substream->pcm, "sync: buffer_used\n"); @@ -1694,7 +1737,9 @@ static int snd_pcm_oss_sync(struct snd_pcm_oss_file *pcm_oss_file) substream->f_flags = saved_f_flags; if (err < 0) return err; + mutex_lock(&runtime->oss.params_lock); runtime->oss.prepare = 1; + mutex_unlock(&runtime->oss.params_lock); } substream = pcm_oss_file->streams[SNDRV_PCM_STREAM_CAPTURE]; @@ -1705,8 +1750,10 @@ static int snd_pcm_oss_sync(struct snd_pcm_oss_file *pcm_oss_file) err = 
snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_DROP, NULL); if (err < 0) return err; + mutex_lock(&runtime->oss.params_lock); runtime->oss.buffer_used = 0; runtime->oss.prepare = 1; + mutex_unlock(&runtime->oss.params_lock); } return 0; } @@ -1725,10 +1772,13 @@ static int snd_pcm_oss_set_rate(struct snd_pcm_oss_file *pcm_oss_file, int rate) rate = 1000; else if (rate > 192000) rate = 192000; + if (mutex_lock_interruptible(&runtime->oss.params_lock)) + return -ERESTARTSYS; if (runtime->oss.rate != rate) { runtime->oss.params = 1; runtime->oss.rate = rate; } + mutex_unlock(&runtime->oss.params_lock); } return snd_pcm_oss_get_rate(pcm_oss_file); } @@ -1756,10 +1806,13 @@ static int snd_pcm_oss_set_channels(struct snd_pcm_oss_file *pcm_oss_file, unsig if (substream == NULL) continue; runtime = substream->runtime; + if (mutex_lock_interruptible(&runtime->oss.params_lock)) + return -ERESTARTSYS; if (runtime->oss.channels != channels) { runtime->oss.params = 1; runtime->oss.channels = channels; } + mutex_unlock(&runtime->oss.params_lock); } return snd_pcm_oss_get_channels(pcm_oss_file); } @@ -1845,10 +1898,13 @@ static int snd_pcm_oss_set_format(struct snd_pcm_oss_file *pcm_oss_file, int for if (substream == NULL) continue; runtime = substream->runtime; + if (mutex_lock_interruptible(&runtime->oss.params_lock)) + return -ERESTARTSYS; if (runtime->oss.format != format) { runtime->oss.params = 1; runtime->oss.format = format; } + mutex_unlock(&runtime->oss.params_lock); } } return snd_pcm_oss_get_format(pcm_oss_file); @@ -1868,8 +1924,6 @@ static int snd_pcm_oss_set_subdivide1(struct snd_pcm_substream *substream, int s { struct snd_pcm_runtime *runtime; - if (substream == NULL) - return 0; runtime = substream->runtime; if (subdivide == 0) { subdivide = runtime->oss.subdivision; @@ -1893,9 +1947,16 @@ static int snd_pcm_oss_set_subdivide(struct snd_pcm_oss_file *pcm_oss_file, int for (idx = 1; idx >= 0; --idx) { struct snd_pcm_substream *substream = pcm_oss_file->streams[idx]; + struct snd_pcm_runtime *runtime; + if (substream == NULL) continue; - if ((err = snd_pcm_oss_set_subdivide1(substream, subdivide)) < 0) + runtime = substream->runtime; + if (mutex_lock_interruptible(&runtime->oss.params_lock)) + return -ERESTARTSYS; + err = snd_pcm_oss_set_subdivide1(substream, subdivide); + mutex_unlock(&runtime->oss.params_lock); + if (err < 0) return err; } return err; @@ -1905,8 +1966,6 @@ static int snd_pcm_oss_set_fragment1(struct snd_pcm_substream *substream, unsign { struct snd_pcm_runtime *runtime; - if (substream == NULL) - return 0; runtime = substream->runtime; if (runtime->oss.subdivision || runtime->oss.fragshift) return -EINVAL; @@ -1926,9 +1985,16 @@ static int snd_pcm_oss_set_fragment(struct snd_pcm_oss_file *pcm_oss_file, unsig for (idx = 1; idx >= 0; --idx) { struct snd_pcm_substream *substream = pcm_oss_file->streams[idx]; + struct snd_pcm_runtime *runtime; + if (substream == NULL) continue; - if ((err = snd_pcm_oss_set_fragment1(substream, val)) < 0) + runtime = substream->runtime; + if (mutex_lock_interruptible(&runtime->oss.params_lock)) + return -ERESTARTSYS; + err = snd_pcm_oss_set_fragment1(substream, val); + mutex_unlock(&runtime->oss.params_lock); + if (err < 0) return err; } return err; @@ -2012,6 +2078,9 @@ static int snd_pcm_oss_set_trigger(struct snd_pcm_oss_file *pcm_oss_file, int tr } if (psubstream) { runtime = psubstream->runtime; + cmd = 0; + if (mutex_lock_interruptible(&runtime->oss.params_lock)) + return -ERESTARTSYS; if (trigger & PCM_ENABLE_OUTPUT) { if 
(runtime->oss.trigger) goto _skip1; @@ -2029,13 +2098,19 @@ static int snd_pcm_oss_set_trigger(struct snd_pcm_oss_file *pcm_oss_file, int tr cmd = SNDRV_PCM_IOCTL_DROP; runtime->oss.prepare = 1; } - err = snd_pcm_kernel_ioctl(psubstream, cmd, NULL); - if (err < 0) - return err; - } _skip1: + mutex_unlock(&runtime->oss.params_lock); + if (cmd) { + err = snd_pcm_kernel_ioctl(psubstream, cmd, NULL); + if (err < 0) + return err; + } + } if (csubstream) { runtime = csubstream->runtime; + cmd = 0; + if (mutex_lock_interruptible(&runtime->oss.params_lock)) + return -ERESTARTSYS; if (trigger & PCM_ENABLE_INPUT) { if (runtime->oss.trigger) goto _skip2; @@ -2050,11 +2125,14 @@ static int snd_pcm_oss_set_trigger(struct snd_pcm_oss_file *pcm_oss_file, int tr cmd = SNDRV_PCM_IOCTL_DROP; runtime->oss.prepare = 1; } - err = snd_pcm_kernel_ioctl(csubstream, cmd, NULL); - if (err < 0) - return err; - } _skip2: + mutex_unlock(&runtime->oss.params_lock); + if (cmd) { + err = snd_pcm_kernel_ioctl(csubstream, cmd, NULL); + if (err < 0) + return err; + } + } return 0; } From 3c32bf76d0549cee03894004a02a71de8b51d1fc Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 23 Mar 2018 08:03:26 +0100 Subject: [PATCH 066/146] ALSA: pcm: Return -EBUSY for OSS ioctls changing busy streams commit 40cab6e88cb0b6c56d3f30b7491a20e803f948f6 upstream. OSS PCM stream management isn't modal but it allows ioctls issued at any time for changing the parameters. In the previous hardening patch ("ALSA: pcm: Avoid potential races between OSS ioctls and read/write"), we covered these races and prevent the corruption by protecting the concurrent accesses via params_lock mutex. However, this means that some ioctls that try to change the stream parameter (e.g. channels or format) would be blocked until the read/write finishes, and it may take really long. Basically changing the parameter while reading/writing is an invalid operation, hence it's even more user-friendly from the API POV if it returns -EBUSY in such a situation. This patch adds such checks in the relevant ioctls with the addition of read/write access refcount. Cc: Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- include/sound/pcm_oss.h | 1 + sound/core/oss/pcm_oss.c | 36 +++++++++++++++++++++++++++--------- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/include/sound/pcm_oss.h b/include/sound/pcm_oss.h index 760c969d8..12bbf8c81 100644 --- a/include/sound/pcm_oss.h +++ b/include/sound/pcm_oss.h @@ -57,6 +57,7 @@ struct snd_pcm_oss_runtime { char *buffer; /* vmallocated period */ size_t buffer_used; /* used length from period buffer */ struct mutex params_lock; + atomic_t rw_ref; /* concurrent read/write accesses */ #ifdef CONFIG_SND_PCM_OSS_PLUGINS struct snd_pcm_plugin *plugin_first; struct snd_pcm_plugin *plugin_last; diff --git a/sound/core/oss/pcm_oss.c b/sound/core/oss/pcm_oss.c index ff90bc4e9..d17b1709e 100644 --- a/sound/core/oss/pcm_oss.c +++ b/sound/core/oss/pcm_oss.c @@ -1404,6 +1404,7 @@ static ssize_t snd_pcm_oss_write1(struct snd_pcm_substream *substream, const cha if (atomic_read(&substream->mmap_count)) return -ENXIO; + atomic_inc(&runtime->oss.rw_ref); while (bytes > 0) { if (mutex_lock_interruptible(&runtime->oss.params_lock)) { tmp = -ERESTARTSYS; @@ -1467,6 +1468,7 @@ static ssize_t snd_pcm_oss_write1(struct snd_pcm_substream *substream, const cha } tmp = 0; } + atomic_dec(&runtime->oss.rw_ref); return xfer > 0 ? 
(snd_pcm_sframes_t)xfer : tmp; } @@ -1512,6 +1514,7 @@ static ssize_t snd_pcm_oss_read1(struct snd_pcm_substream *substream, char __use if (atomic_read(&substream->mmap_count)) return -ENXIO; + atomic_inc(&runtime->oss.rw_ref); while (bytes > 0) { if (mutex_lock_interruptible(&runtime->oss.params_lock)) { tmp = -ERESTARTSYS; @@ -1560,6 +1563,7 @@ static ssize_t snd_pcm_oss_read1(struct snd_pcm_substream *substream, char __use } tmp = 0; } + atomic_dec(&runtime->oss.rw_ref); return xfer > 0 ? (snd_pcm_sframes_t)xfer : tmp; } @@ -1666,8 +1670,11 @@ static int snd_pcm_oss_sync(struct snd_pcm_oss_file *pcm_oss_file) goto __direct; if ((err = snd_pcm_oss_make_ready(substream)) < 0) return err; - if (mutex_lock_interruptible(&runtime->oss.params_lock)) + atomic_inc(&runtime->oss.rw_ref); + if (mutex_lock_interruptible(&runtime->oss.params_lock)) { + atomic_dec(&runtime->oss.rw_ref); return -ERESTARTSYS; + } format = snd_pcm_oss_format_from(runtime->oss.format); width = snd_pcm_format_physical_width(format); if (runtime->oss.buffer_used > 0) { @@ -1679,10 +1686,8 @@ static int snd_pcm_oss_sync(struct snd_pcm_oss_file *pcm_oss_file) runtime->oss.buffer + runtime->oss.buffer_used, size); err = snd_pcm_oss_sync1(substream, runtime->oss.period_bytes); - if (err < 0) { - mutex_unlock(&runtime->oss.params_lock); - return err; - } + if (err < 0) + goto unlock; } else if (runtime->oss.period_ptr > 0) { #ifdef OSS_DEBUG pcm_dbg(substream->pcm, "sync: period_ptr\n"); @@ -1692,10 +1697,8 @@ static int snd_pcm_oss_sync(struct snd_pcm_oss_file *pcm_oss_file) runtime->oss.buffer, size * 8 / width); err = snd_pcm_oss_sync1(substream, size); - if (err < 0) { - mutex_unlock(&runtime->oss.params_lock); - return err; - } + if (err < 0) + goto unlock; } /* * The ALSA's period might be a bit large than OSS one. 
@@ -1726,7 +1729,11 @@ static int snd_pcm_oss_sync(struct snd_pcm_oss_file *pcm_oss_file) snd_pcm_lib_writev(substream, buffers, size); } } +unlock: mutex_unlock(&runtime->oss.params_lock); + atomic_dec(&runtime->oss.rw_ref); + if (err < 0) + return err; /* * finish sync: drain the buffer */ @@ -1774,6 +1781,8 @@ static int snd_pcm_oss_set_rate(struct snd_pcm_oss_file *pcm_oss_file, int rate) rate = 192000; if (mutex_lock_interruptible(&runtime->oss.params_lock)) return -ERESTARTSYS; + if (atomic_read(&runtime->oss.rw_ref)) + return -EBUSY; if (runtime->oss.rate != rate) { runtime->oss.params = 1; runtime->oss.rate = rate; @@ -1808,6 +1817,8 @@ static int snd_pcm_oss_set_channels(struct snd_pcm_oss_file *pcm_oss_file, unsig runtime = substream->runtime; if (mutex_lock_interruptible(&runtime->oss.params_lock)) return -ERESTARTSYS; + if (atomic_read(&runtime->oss.rw_ref)) + return -EBUSY; if (runtime->oss.channels != channels) { runtime->oss.params = 1; runtime->oss.channels = channels; @@ -1898,6 +1909,8 @@ static int snd_pcm_oss_set_format(struct snd_pcm_oss_file *pcm_oss_file, int for if (substream == NULL) continue; runtime = substream->runtime; + if (atomic_read(&runtime->oss.rw_ref)) + return -EBUSY; if (mutex_lock_interruptible(&runtime->oss.params_lock)) return -ERESTARTSYS; if (runtime->oss.format != format) { @@ -1952,6 +1965,8 @@ static int snd_pcm_oss_set_subdivide(struct snd_pcm_oss_file *pcm_oss_file, int if (substream == NULL) continue; runtime = substream->runtime; + if (atomic_read(&runtime->oss.rw_ref)) + return -EBUSY; if (mutex_lock_interruptible(&runtime->oss.params_lock)) return -ERESTARTSYS; err = snd_pcm_oss_set_subdivide1(substream, subdivide); @@ -1990,6 +2005,8 @@ static int snd_pcm_oss_set_fragment(struct snd_pcm_oss_file *pcm_oss_file, unsig if (substream == NULL) continue; runtime = substream->runtime; + if (atomic_read(&runtime->oss.rw_ref)) + return -EBUSY; if (mutex_lock_interruptible(&runtime->oss.params_lock)) return -ERESTARTSYS; err = snd_pcm_oss_set_fragment1(substream, val); @@ -2384,6 +2401,7 @@ static void snd_pcm_oss_init_substream(struct snd_pcm_substream *substream, runtime->oss.maxfrags = 0; runtime->oss.subdivision = 0; substream->pcm_release = snd_pcm_oss_release_substream; + atomic_set(&runtime->oss.rw_ref, 0); } static int snd_pcm_oss_release_file(struct snd_pcm_oss_file *pcm_oss_file) From e61d8da78fd0f3ccbbed16fdaf08b20bf02b0cf4 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 27 Mar 2018 14:32:23 +0200 Subject: [PATCH 067/146] ALSA: pcm: Fix mutex unbalance in OSS emulation ioctls commit f6d297df4dd47ef949540e4a201230d0c5308325 upstream. The previous fix 40cab6e88cb0 ("ALSA: pcm: Return -EBUSY for OSS ioctls changing busy streams") introduced some mutex unbalance; the check of runtime->oss.rw_ref was inserted in a wrong place after the mutex lock. This patch fixes the inconsistency by rewriting with the helper functions to lock/unlock parameters with the stream check. 
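The helpers introduced below centralize the busy-stream check, so every parameter-changing ioctl follows the same shape. A hypothetical caller (the function itself is made up, modeled on the snd_pcm_oss_set_rate() hunk in this patch) looks like:

static int example_set_param(struct snd_pcm_substream *substream, int rate)
{
	struct snd_pcm_runtime *runtime = substream->runtime;
	int err;

	err = lock_params(runtime);	/* -EBUSY while a read/write is in flight */
	if (err < 0)
		return err;

	if (runtime->oss.rate != rate) {
		runtime->oss.params = 1;
		runtime->oss.rate = rate;
	}

	unlock_params(runtime);
	return 0;
}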
Fixes: 40cab6e88cb0 ("ALSA: pcm: Return -EBUSY for OSS ioctls changing busy streams") Reported-by: Dan Carpenter Cc: Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/core/oss/pcm_oss.c | 67 +++++++++++++++++++++++++--------------- 1 file changed, 42 insertions(+), 25 deletions(-) diff --git a/sound/core/oss/pcm_oss.c b/sound/core/oss/pcm_oss.c index d17b1709e..9df14f789 100644 --- a/sound/core/oss/pcm_oss.c +++ b/sound/core/oss/pcm_oss.c @@ -833,6 +833,23 @@ static int choose_rate(struct snd_pcm_substream *substream, return snd_pcm_hw_param_near(substream, params, SNDRV_PCM_HW_PARAM_RATE, best_rate, NULL); } +/* parameter locking: returns immediately if tried during streaming */ +static int lock_params(struct snd_pcm_runtime *runtime) +{ + if (mutex_lock_interruptible(&runtime->oss.params_lock)) + return -ERESTARTSYS; + if (atomic_read(&runtime->oss.rw_ref)) { + mutex_unlock(&runtime->oss.params_lock); + return -EBUSY; + } + return 0; +} + +static void unlock_params(struct snd_pcm_runtime *runtime) +{ + mutex_unlock(&runtime->oss.params_lock); +} + /* call with params_lock held */ static int snd_pcm_oss_change_params_locked(struct snd_pcm_substream *substream) { @@ -1772,6 +1789,8 @@ static int snd_pcm_oss_set_rate(struct snd_pcm_oss_file *pcm_oss_file, int rate) for (idx = 1; idx >= 0; --idx) { struct snd_pcm_substream *substream = pcm_oss_file->streams[idx]; struct snd_pcm_runtime *runtime; + int err; + if (substream == NULL) continue; runtime = substream->runtime; @@ -1779,15 +1798,14 @@ static int snd_pcm_oss_set_rate(struct snd_pcm_oss_file *pcm_oss_file, int rate) rate = 1000; else if (rate > 192000) rate = 192000; - if (mutex_lock_interruptible(&runtime->oss.params_lock)) - return -ERESTARTSYS; - if (atomic_read(&runtime->oss.rw_ref)) - return -EBUSY; + err = lock_params(runtime); + if (err < 0) + return err; if (runtime->oss.rate != rate) { runtime->oss.params = 1; runtime->oss.rate = rate; } - mutex_unlock(&runtime->oss.params_lock); + unlock_params(runtime); } return snd_pcm_oss_get_rate(pcm_oss_file); } @@ -1812,18 +1830,19 @@ static int snd_pcm_oss_set_channels(struct snd_pcm_oss_file *pcm_oss_file, unsig for (idx = 1; idx >= 0; --idx) { struct snd_pcm_substream *substream = pcm_oss_file->streams[idx]; struct snd_pcm_runtime *runtime; + int err; + if (substream == NULL) continue; runtime = substream->runtime; - if (mutex_lock_interruptible(&runtime->oss.params_lock)) - return -ERESTARTSYS; - if (atomic_read(&runtime->oss.rw_ref)) - return -EBUSY; + err = lock_params(runtime); + if (err < 0) + return err; if (runtime->oss.channels != channels) { runtime->oss.params = 1; runtime->oss.channels = channels; } - mutex_unlock(&runtime->oss.params_lock); + unlock_params(runtime); } return snd_pcm_oss_get_channels(pcm_oss_file); } @@ -1896,6 +1915,7 @@ static int snd_pcm_oss_get_formats(struct snd_pcm_oss_file *pcm_oss_file) static int snd_pcm_oss_set_format(struct snd_pcm_oss_file *pcm_oss_file, int format) { int formats, idx; + int err; if (format != AFMT_QUERY) { formats = snd_pcm_oss_get_formats(pcm_oss_file); @@ -1909,15 +1929,14 @@ static int snd_pcm_oss_set_format(struct snd_pcm_oss_file *pcm_oss_file, int for if (substream == NULL) continue; runtime = substream->runtime; - if (atomic_read(&runtime->oss.rw_ref)) - return -EBUSY; - if (mutex_lock_interruptible(&runtime->oss.params_lock)) - return -ERESTARTSYS; + err = lock_params(runtime); + if (err < 0) + return err; if (runtime->oss.format != format) { runtime->oss.params = 1; runtime->oss.format = format; 
} - mutex_unlock(&runtime->oss.params_lock); + unlock_params(runtime); } } return snd_pcm_oss_get_format(pcm_oss_file); @@ -1965,12 +1984,11 @@ static int snd_pcm_oss_set_subdivide(struct snd_pcm_oss_file *pcm_oss_file, int if (substream == NULL) continue; runtime = substream->runtime; - if (atomic_read(&runtime->oss.rw_ref)) - return -EBUSY; - if (mutex_lock_interruptible(&runtime->oss.params_lock)) - return -ERESTARTSYS; + err = lock_params(runtime); + if (err < 0) + return err; err = snd_pcm_oss_set_subdivide1(substream, subdivide); - mutex_unlock(&runtime->oss.params_lock); + unlock_params(runtime); if (err < 0) return err; } @@ -2005,12 +2023,11 @@ static int snd_pcm_oss_set_fragment(struct snd_pcm_oss_file *pcm_oss_file, unsig if (substream == NULL) continue; runtime = substream->runtime; - if (atomic_read(&runtime->oss.rw_ref)) - return -EBUSY; - if (mutex_lock_interruptible(&runtime->oss.params_lock)) - return -ERESTARTSYS; + err = lock_params(runtime); + if (err < 0) + return err; err = snd_pcm_oss_set_fragment1(substream, val); - mutex_unlock(&runtime->oss.params_lock); + unlock_params(runtime); if (err < 0) return err; } From 4e8e49e2da00bff262f3f9cc4b953b27c07b5ecb Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Sat, 7 Apr 2018 11:48:58 +0200 Subject: [PATCH 068/146] ALSA: pcm: Fix endless loop for XRUN recovery in OSS emulation commit e15dc99dbb9cf99f6432e8e3c0b3a8f7a3403a86 upstream. The commit 02a5d6925cd3 ("ALSA: pcm: Avoid potential races between OSS ioctls and read/write") split the PCM preparation code to a locked version, and it added a sanity check of runtime->oss.prepare flag along with the change. This leaded to an endless loop when the stream gets XRUN: namely, snd_pcm_oss_write3() and co call snd_pcm_oss_prepare() without setting runtime->oss.prepare flag and the loop continues until the PCM state reaches to another one. As the function is supposed to execute the preparation unconditionally, drop the invalid state check there. The bug was triggered by syzkaller. Fixes: 02a5d6925cd3 ("ALSA: pcm: Avoid potential races between OSS ioctls and read/write") Reported-by: syzbot+150189c103427d31a053@syzkaller.appspotmail.com Reported-by: syzbot+7e3f31a52646f939c052@syzkaller.appspotmail.com Reported-by: syzbot+4f2016cf5185da7759dc@syzkaller.appspotmail.com Cc: Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/core/oss/pcm_oss.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sound/core/oss/pcm_oss.c b/sound/core/oss/pcm_oss.c index 9df14f789..07feb35f1 100644 --- a/sound/core/oss/pcm_oss.c +++ b/sound/core/oss/pcm_oss.c @@ -1138,13 +1138,14 @@ static int snd_pcm_oss_get_active_substream(struct snd_pcm_oss_file *pcm_oss_fil } /* call with params_lock held */ +/* NOTE: this always call PREPARE unconditionally no matter whether + * runtime->oss.prepare is set or not + */ static int snd_pcm_oss_prepare(struct snd_pcm_substream *substream) { int err; struct snd_pcm_runtime *runtime = substream->runtime; - if (!runtime->oss.prepare) - return 0; err = snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_PREPARE, NULL); if (err < 0) { pcm_dbg(substream->pcm, From d404f247084cc4f26e49a160b320a9cfd5d6565a Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Mon, 26 Sep 2016 13:52:16 -0600 Subject: [PATCH 069/146] vfio-pci: Virtualize PCIe & AF FLR commit ddf9dc0eb5314d6dac8b19b1cc37c739c6896e7e upstream. 
We use a BAR restore trick to try to detect when a user has performed a device reset, possibly through FLR or other backdoors, to put things back into a working state. This is important for backdoor resets, but we can actually just virtualize the "front door" resets provided via PCIe and AF FLR. Set these bits as virtualized + writable, allowing the default write to set them in vconfig, then we can simply check the bit, perform an FLR of our own, and clear the bit. We don't actually have the granularity in PCI to specify the type of reset we want to do, but generally devices don't implement both PCIe and AF FLR and we'll favor these over other types of reset, so we should generally lineup. We do test whether the device provides the requested FLR type to stay consistent with hardware capabilities though. This seems to fix several instance of devices getting into bad states with userspace drivers, like dpdk, running inside a VM. Signed-off-by: Alex Williamson Reviewed-by: Greg Rose Signed-off-by: Greg Kroah-Hartman --- drivers/vfio/pci/vfio_pci_config.c | 82 ++++++++++++++++++++++++++++-- 1 file changed, 77 insertions(+), 5 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index fe2b470d7..7a6b85f76 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -752,6 +752,40 @@ static int __init init_pci_cap_pcix_perm(struct perm_bits *perm) return 0; } +static int vfio_exp_config_write(struct vfio_pci_device *vdev, int pos, + int count, struct perm_bits *perm, + int offset, __le32 val) +{ + __le16 *ctrl = (__le16 *)(vdev->vconfig + pos - + offset + PCI_EXP_DEVCTL); + + count = vfio_default_config_write(vdev, pos, count, perm, offset, val); + if (count < 0) + return count; + + /* + * The FLR bit is virtualized, if set and the device supports PCIe + * FLR, issue a reset_function. Regardless, clear the bit, the spec + * requires it to be always read as zero. NB, reset_function might + * not use a PCIe FLR, we don't have that level of granularity. + */ + if (*ctrl & cpu_to_le16(PCI_EXP_DEVCTL_BCR_FLR)) { + u32 cap; + int ret; + + *ctrl &= ~cpu_to_le16(PCI_EXP_DEVCTL_BCR_FLR); + + ret = pci_user_read_config_dword(vdev->pdev, + pos - offset + PCI_EXP_DEVCAP, + &cap); + + if (!ret && (cap & PCI_EXP_DEVCAP_FLR)) + pci_try_reset_function(vdev->pdev); + } + + return count; +} + /* Permissions for PCI Express capability */ static int __init init_pci_cap_exp_perm(struct perm_bits *perm) { @@ -759,26 +793,64 @@ static int __init init_pci_cap_exp_perm(struct perm_bits *perm) if (alloc_perm_bits(perm, PCI_CAP_EXP_ENDPOINT_SIZEOF_V2)) return -ENOMEM; + perm->writefn = vfio_exp_config_write; + p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE); /* - * Allow writes to device control fields (includes FLR!) - * but not to devctl_phantom which could confuse IOMMU - * or to the ARI bit in devctl2 which is set at probe time + * Allow writes to device control fields, except devctl_phantom, + * which could confuse IOMMU, and the ARI bit in devctl2, which + * is set at probe time. FLR gets virtualized via our writefn. 
*/ - p_setw(perm, PCI_EXP_DEVCTL, NO_VIRT, ~PCI_EXP_DEVCTL_PHANTOM); + p_setw(perm, PCI_EXP_DEVCTL, + PCI_EXP_DEVCTL_BCR_FLR, ~PCI_EXP_DEVCTL_PHANTOM); p_setw(perm, PCI_EXP_DEVCTL2, NO_VIRT, ~PCI_EXP_DEVCTL2_ARI); return 0; } +static int vfio_af_config_write(struct vfio_pci_device *vdev, int pos, + int count, struct perm_bits *perm, + int offset, __le32 val) +{ + u8 *ctrl = vdev->vconfig + pos - offset + PCI_AF_CTRL; + + count = vfio_default_config_write(vdev, pos, count, perm, offset, val); + if (count < 0) + return count; + + /* + * The FLR bit is virtualized, if set and the device supports AF + * FLR, issue a reset_function. Regardless, clear the bit, the spec + * requires it to be always read as zero. NB, reset_function might + * not use an AF FLR, we don't have that level of granularity. + */ + if (*ctrl & PCI_AF_CTRL_FLR) { + u8 cap; + int ret; + + *ctrl &= ~PCI_AF_CTRL_FLR; + + ret = pci_user_read_config_byte(vdev->pdev, + pos - offset + PCI_AF_CAP, + &cap); + + if (!ret && (cap & PCI_AF_CAP_FLR) && (cap & PCI_AF_CAP_TP)) + pci_try_reset_function(vdev->pdev); + } + + return count; +} + /* Permissions for Advanced Function capability */ static int __init init_pci_cap_af_perm(struct perm_bits *perm) { if (alloc_perm_bits(perm, pci_cap_length[PCI_CAP_ID_AF])) return -ENOMEM; + perm->writefn = vfio_af_config_write; + p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE); - p_setb(perm, PCI_AF_CTRL, NO_VIRT, PCI_AF_CTRL_FLR); + p_setb(perm, PCI_AF_CTRL, PCI_AF_CTRL_FLR, PCI_AF_CTRL_FLR); return 0; } From 56af20b853c3e224988f725a0012d8f964c96203 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Mon, 2 Oct 2017 12:39:09 -0600 Subject: [PATCH 070/146] vfio/pci: Virtualize Maximum Payload Size commit 523184972b282cd9ca17a76f6ca4742394856818 upstream. With virtual PCI-Express chipsets, we now see userspace/guest drivers trying to match the physical MPS setting to a virtual downstream port. Of course a lone physical device surrounded by virtual interconnects cannot make a correct decision for a proper MPS setting. Instead, let's virtualize the MPS control register so that writes through to hardware are disallowed. Userspace drivers like QEMU assume they can write anything to the device and we'll filter out anything dangerous. Since mismatched MPS can lead to AER and other faults, let's add it to the kernel side rather than relying on userspace virtualization to handle it. Signed-off-by: Alex Williamson Reviewed-by: Eric Auger Signed-off-by: Greg Kroah-Hartman --- drivers/vfio/pci/vfio_pci_config.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index 7a6b85f76..90c4fd2b3 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -799,11 +799,13 @@ static int __init init_pci_cap_exp_perm(struct perm_bits *perm) /* * Allow writes to device control fields, except devctl_phantom, - * which could confuse IOMMU, and the ARI bit in devctl2, which + * which could confuse IOMMU, MPS, which can break communication + * with other physical devices, and the ARI bit in devctl2, which * is set at probe time. FLR gets virtualized via our writefn. 
*/ p_setw(perm, PCI_EXP_DEVCTL, - PCI_EXP_DEVCTL_BCR_FLR, ~PCI_EXP_DEVCTL_PHANTOM); + PCI_EXP_DEVCTL_BCR_FLR | PCI_EXP_DEVCTL_PAYLOAD, + ~PCI_EXP_DEVCTL_PHANTOM); p_setw(perm, PCI_EXP_DEVCTL2, NO_VIRT, ~PCI_EXP_DEVCTL2_ARI); return 0; } From a4a4501a19380d9ce49144e3426bd1907c32b77b Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Mon, 2 Oct 2017 12:39:10 -0600 Subject: [PATCH 071/146] vfio/pci: Virtualize Maximum Read Request Size commit cf0d53ba4947aad6e471491d5b20a567cbe92e56 upstream. MRRS defines the maximum read request size a device is allowed to make. Drivers will often increase this to allow more data transfer with a single request. Completions to this request are bound by the MPS setting for the bus. Aside from device quirks (none known), it doesn't seem to make sense to set an MRRS value less than MPS, yet this is a likely scenario given that user drivers do not have a system-wide view of the PCI topology. Virtualize MRRS such that the user can set MRRS >= MPS, but use MPS as the floor value that we'll write to hardware. Signed-off-by: Alex Williamson Signed-off-by: Greg Kroah-Hartman --- drivers/vfio/pci/vfio_pci_config.c | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index 90c4fd2b3..c55c632a3 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -758,6 +758,7 @@ static int vfio_exp_config_write(struct vfio_pci_device *vdev, int pos, { __le16 *ctrl = (__le16 *)(vdev->vconfig + pos - offset + PCI_EXP_DEVCTL); + int readrq = le16_to_cpu(*ctrl) & PCI_EXP_DEVCTL_READRQ; count = vfio_default_config_write(vdev, pos, count, perm, offset, val); if (count < 0) @@ -783,6 +784,27 @@ static int vfio_exp_config_write(struct vfio_pci_device *vdev, int pos, pci_try_reset_function(vdev->pdev); } + /* + * MPS is virtualized to the user, writes do not change the physical + * register since determining a proper MPS value requires a system wide + * device view. The MRRS is largely independent of MPS, but since the + * user does not have that system-wide view, they might set a safe, but + * inefficiently low value. Here we allow writes through to hardware, + * but we set the floor to the physical device MPS setting, so that + * we can at least use full TLPs, as defined by the MPS value. + * + * NB, if any devices actually depend on an artificially low MRRS + * setting, this will need to be revisited, perhaps with a quirk + * though pcie_set_readrq(). + */ + if (readrq != (le16_to_cpu(*ctrl) & PCI_EXP_DEVCTL_READRQ)) { + readrq = 128 << + ((le16_to_cpu(*ctrl) & PCI_EXP_DEVCTL_READRQ) >> 12); + readrq = max(readrq, pcie_get_mps(vdev->pdev)); + + pcie_set_readrq(vdev->pdev, readrq); + } + return count; } @@ -801,11 +823,12 @@ static int __init init_pci_cap_exp_perm(struct perm_bits *perm) * Allow writes to device control fields, except devctl_phantom, * which could confuse IOMMU, MPS, which can break communication * with other physical devices, and the ARI bit in devctl2, which - * is set at probe time. FLR gets virtualized via our writefn. + * is set at probe time. FLR and MRRS get virtualized via our + * writefn. 
*/ p_setw(perm, PCI_EXP_DEVCTL, - PCI_EXP_DEVCTL_BCR_FLR | PCI_EXP_DEVCTL_PAYLOAD, - ~PCI_EXP_DEVCTL_PHANTOM); + PCI_EXP_DEVCTL_BCR_FLR | PCI_EXP_DEVCTL_PAYLOAD | + PCI_EXP_DEVCTL_READRQ, ~PCI_EXP_DEVCTL_PHANTOM); p_setw(perm, PCI_EXP_DEVCTL2, NO_VIRT, ~PCI_EXP_DEVCTL2_ARI); return 0; } From 35a1c426825a24e3dadf0d8d3236ddab1f1adcff Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 29 Mar 2018 22:10:35 -0400 Subject: [PATCH 072/146] ext4: don't allow r/w mounts if metadata blocks overlap the superblock commit 18db4b4e6fc31eda838dd1c1296d67dbcb3dc957 upstream. If some metadata block, such as an allocation bitmap, overlaps the superblock, it's very likely that if the file system is mounted read/write, the results will not be pretty. So disallow r/w mounts for file systems corrupted in this particular way. Backport notes: 3.18.y is missing bc98a42c1f7d ("VFS: Convert sb->s_flags & MS_RDONLY to sb_rdonly(sb)") and e462ec50cb5f ("VFS: Differentiate mount flags (MS_*) from internal superblock flags") so we simply use the sb MS_RDONLY check from pre bc98a42c1f7d in place of the sb_rdonly function used in the upstream variant of the patch. Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org Signed-off-by: Harsh Shandilya Signed-off-by: Greg Kroah-Hartman --- fs/ext4/super.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index bc79e2ca4..8cff133ff 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2130,6 +2130,8 @@ static int ext4_check_descriptors(struct super_block *sb, ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Block bitmap for group %u overlaps " "superblock", i); + if (!(sb->s_flags & MS_RDONLY)) + return 0; } if (block_bitmap < first_block || block_bitmap > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " @@ -2142,6 +2144,8 @@ static int ext4_check_descriptors(struct super_block *sb, ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode bitmap for group %u overlaps " "superblock", i); + if (!(sb->s_flags & MS_RDONLY)) + return 0; } if (inode_bitmap < first_block || inode_bitmap > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " @@ -2154,6 +2158,8 @@ static int ext4_check_descriptors(struct super_block *sb, ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode table for group %u overlaps " "superblock", i); + if (!(sb->s_flags & MS_RDONLY)) + return 0; } if (inode_table < first_block || inode_table + sbi->s_itb_per_group - 1 > last_block) { From 7a1210311e7b56327c144037a4e7115970d88f9c Mon Sep 17 00:00:00 2001 From: Paul Parsons Date: Sat, 2 Apr 2016 12:32:30 +0100 Subject: [PATCH 073/146] drm/radeon: Fix PCIe lane width calculation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 85e290d92b4b794d0c758c53007eb4248d385386 upstream. Two years ago I tried an AMD Radeon E8860 embedded GPU with the drm driver. The dmesg output included driver warnings about an invalid PCIe lane width. Tracking the problem back led to si_set_pcie_lane_width_in_smc(). The calculation of the lane widths via ATOM_PPLIB_PCIE_LINK_WIDTH_MASK and ATOM_PPLIB_PCIE_LINK_WIDTH_SHIFT macros did not increment the resulting value, per the comment in pptable.h ("lanes - 1"), and per usage elsewhere. Applying the increment silenced the warnings. The code has not changed since, so either my analysis was incorrect or the bug has gone unnoticed. Hence submitting this as an RFC. 
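To make the off-by-one concrete, here is a small self-contained sketch of the decode step the diff below corrects. The mask and shift values are placeholders chosen for illustration; the real field layout comes from the ATOM_PPLIB_PCIE_LINK_WIDTH_MASK/_SHIFT macros in pptable.h.

#include <stdint.h>
#include <stdio.h>

/* Placeholder field layout, for illustration only. */
#define LINK_WIDTH_MASK  0x000000F0u
#define LINK_WIDTH_SHIFT 4

/* The power-play table stores "lanes - 1", so the decoded field must be
 * incremented to recover the real lane count. */
static uint32_t decode_lane_width(uint32_t caps)
{
	return ((caps & LINK_WIDTH_MASK) >> LINK_WIDTH_SHIFT) + 1;
}

int main(void)
{
	uint32_t caps = 15u << LINK_WIDTH_SHIFT;	/* 16 lanes encoded as 15 */

	printf("lane width = %u\n", decode_lane_width(caps));	/* prints 16 */
	return 0;
}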
Acked-by: Christian König Acked-by: Chunming Zhou Signed-off-by: Paul Parsons Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/radeon/si_dpm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/radeon/si_dpm.c b/drivers/gpu/drm/radeon/si_dpm.c index d9007cc37..892d0a71d 100644 --- a/drivers/gpu/drm/radeon/si_dpm.c +++ b/drivers/gpu/drm/radeon/si_dpm.c @@ -5964,9 +5964,9 @@ static void si_set_pcie_lane_width_in_smc(struct radeon_device *rdev, { u32 lane_width; u32 new_lane_width = - (radeon_new_state->caps & ATOM_PPLIB_PCIE_LINK_WIDTH_MASK) >> ATOM_PPLIB_PCIE_LINK_WIDTH_SHIFT; + ((radeon_new_state->caps & ATOM_PPLIB_PCIE_LINK_WIDTH_MASK) >> ATOM_PPLIB_PCIE_LINK_WIDTH_SHIFT) + 1; u32 current_lane_width = - (radeon_current_state->caps & ATOM_PPLIB_PCIE_LINK_WIDTH_MASK) >> ATOM_PPLIB_PCIE_LINK_WIDTH_SHIFT; + ((radeon_current_state->caps & ATOM_PPLIB_PCIE_LINK_WIDTH_MASK) >> ATOM_PPLIB_PCIE_LINK_WIDTH_SHIFT) + 1; if (new_lane_width != current_lane_width) { radeon_set_pcie_lanes(rdev, new_lane_width); From 0181beadf851cf2c0a75a162dff77b311175c456 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 19 Feb 2016 00:33:21 -0500 Subject: [PATCH 074/146] ext4: fix crashes in dioread_nolock mode commit 74dae4278546b897eb81784fdfcce872ddd8b2b8 upstream. Competing overwrite DIO in dioread_nolock mode will just overwrite pointer to io_end in the inode. This may result in data corruption or extent conversion happening from IO completion interrupt because we don't properly set buffer_defer_completion() when unlocked DIO races with locked DIO to unwritten extent. Since unlocked DIO doesn't need io_end for anything, just avoid allocating it and corrupting pointer from inode for locked DIO. A cleaner fix would be to avoid these games with io_end pointer from the inode but that requires more intrusive changes so we leave that for later. Cc: stable@vger.kernel.org Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Signed-off-by: Nathan Chancellor Signed-off-by: Greg Kroah-Hartman --- fs/ext4/inode.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ca9a7d98d..04aed40ae 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3279,29 +3279,29 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, * case, we allocate an io_end structure to hook to the iocb. */ iocb->private = NULL; - ext4_inode_aio_set(inode, NULL); - if (!is_sync_kiocb(iocb)) { - io_end = ext4_init_io_end(inode, GFP_NOFS); - if (!io_end) { - ret = -ENOMEM; - goto retake_lock; - } - /* - * Grab reference for DIO. Will be dropped in ext4_end_io_dio() - */ - iocb->private = ext4_get_io_end(io_end); - /* - * we save the io structure for current async direct - * IO, so that later ext4_map_blocks() could flag the - * io structure whether there is a unwritten extents - * needs to be converted when IO is completed. - */ - ext4_inode_aio_set(inode, io_end); - } - if (overwrite) { get_block_func = ext4_get_block_write_nolock; } else { + ext4_inode_aio_set(inode, NULL); + if (!is_sync_kiocb(iocb)) { + io_end = ext4_init_io_end(inode, GFP_NOFS); + if (!io_end) { + ret = -ENOMEM; + goto retake_lock; + } + /* + * Grab reference for DIO. 
Will be dropped in + * ext4_end_io_dio() + */ + iocb->private = ext4_get_io_end(io_end); + /* + * we save the io structure for current async direct + * IO, so that later ext4_map_blocks() could flag the + * io structure whether there is a unwritten extents + * needs to be converted when IO is completed. + */ + ext4_inode_aio_set(inode, io_end); + } get_block_func = ext4_get_block_write; dio_flags = DIO_LOCKING; } From c8d619e5181da49fd61214c4ab84c51c1b24f91c Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 11 Jan 2017 21:50:46 -0500 Subject: [PATCH 075/146] ext4: fix deadlock between inline_data and ext4_expand_extra_isize_ea() commit c755e251357a0cee0679081f08c3f4ba797a8009 upstream. The xattr_sem deadlock problems fixed in commit 2e81a4eeedca: "ext4: avoid deadlock when expanding inode size" didn't include the use of xattr_sem in fs/ext4/inline.c. With the addition of project quota which added a new extra inode field, this exposed deadlocks in the inline_data code similar to the ones fixed by 2e81a4eeedca. The deadlock can be reproduced via: dmesg -n 7 mke2fs -t ext4 -O inline_data -Fq -I 256 /dev/vdc 32768 mount -t ext4 -o debug_want_extra_isize=24 /dev/vdc /vdc mkdir /vdc/a umount /vdc mount -t ext4 /dev/vdc /vdc echo foo > /vdc/a/foo and looks like this: [ 11.158815] [ 11.160276] ============================================= [ 11.161960] [ INFO: possible recursive locking detected ] [ 11.161960] 4.10.0-rc3-00015-g011b30a8a3cf #160 Tainted: G W [ 11.161960] --------------------------------------------- [ 11.161960] bash/2519 is trying to acquire lock: [ 11.161960] (&ei->xattr_sem){++++..}, at: [] ext4_expand_extra_isize_ea+0x3d/0x4cd [ 11.161960] [ 11.161960] but task is already holding lock: [ 11.161960] (&ei->xattr_sem){++++..}, at: [] ext4_try_add_inline_entry+0x3a/0x152 [ 11.161960] [ 11.161960] other info that might help us debug this: [ 11.161960] Possible unsafe locking scenario: [ 11.161960] [ 11.161960] CPU0 [ 11.161960] ---- [ 11.161960] lock(&ei->xattr_sem); [ 11.161960] lock(&ei->xattr_sem); [ 11.161960] [ 11.161960] *** DEADLOCK *** [ 11.161960] [ 11.161960] May be due to missing lock nesting notation [ 11.161960] [ 11.161960] 4 locks held by bash/2519: [ 11.161960] #0: (sb_writers#3){.+.+.+}, at: [] mnt_want_write+0x1e/0x3e [ 11.161960] #1: (&type->i_mutex_dir_key){++++++}, at: [] path_openat+0x338/0x67a [ 11.161960] #2: (jbd2_handle){++++..}, at: [] start_this_handle+0x582/0x622 [ 11.161960] #3: (&ei->xattr_sem){++++..}, at: [] ext4_try_add_inline_entry+0x3a/0x152 [ 11.161960] [ 11.161960] stack backtrace: [ 11.161960] CPU: 0 PID: 2519 Comm: bash Tainted: G W 4.10.0-rc3-00015-g011b30a8a3cf #160 [ 11.161960] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.1-1 04/01/2014 [ 11.161960] Call Trace: [ 11.161960] dump_stack+0x72/0xa3 [ 11.161960] __lock_acquire+0xb7c/0xcb9 [ 11.161960] ? kvm_clock_read+0x1f/0x29 [ 11.161960] ? __lock_is_held+0x36/0x66 [ 11.161960] ? __lock_is_held+0x36/0x66 [ 11.161960] lock_acquire+0x106/0x18a [ 11.161960] ? ext4_expand_extra_isize_ea+0x3d/0x4cd [ 11.161960] down_write+0x39/0x72 [ 11.161960] ? ext4_expand_extra_isize_ea+0x3d/0x4cd [ 11.161960] ext4_expand_extra_isize_ea+0x3d/0x4cd [ 11.161960] ? _raw_read_unlock+0x22/0x2c [ 11.161960] ? jbd2_journal_extend+0x1e2/0x262 [ 11.161960] ? __ext4_journal_get_write_access+0x3d/0x60 [ 11.161960] ext4_mark_inode_dirty+0x17d/0x26d [ 11.161960] ? 
ext4_add_dirent_to_inline.isra.12+0xa5/0xb2 [ 11.161960] ext4_add_dirent_to_inline.isra.12+0xa5/0xb2 [ 11.161960] ext4_try_add_inline_entry+0x69/0x152 [ 11.161960] ext4_add_entry+0xa3/0x848 [ 11.161960] ? __brelse+0x14/0x2f [ 11.161960] ? _raw_spin_unlock_irqrestore+0x44/0x4f [ 11.161960] ext4_add_nondir+0x17/0x5b [ 11.161960] ext4_create+0xcf/0x133 [ 11.161960] ? ext4_mknod+0x12f/0x12f [ 11.161960] lookup_open+0x39e/0x3fb [ 11.161960] ? __wake_up+0x1a/0x40 [ 11.161960] ? lock_acquire+0x11e/0x18a [ 11.161960] path_openat+0x35c/0x67a [ 11.161960] ? sched_clock_cpu+0xd7/0xf2 [ 11.161960] do_filp_open+0x36/0x7c [ 11.161960] ? _raw_spin_unlock+0x22/0x2c [ 11.161960] ? __alloc_fd+0x169/0x173 [ 11.161960] do_sys_open+0x59/0xcc [ 11.161960] SyS_open+0x1d/0x1f [ 11.161960] do_int80_syscall_32+0x4f/0x61 [ 11.161960] entry_INT80_32+0x2f/0x2f [ 11.161960] EIP: 0xb76ad469 [ 11.161960] EFLAGS: 00000286 CPU: 0 [ 11.161960] EAX: ffffffda EBX: 08168ac8 ECX: 00008241 EDX: 000001b6 [ 11.161960] ESI: b75e46bc EDI: b7755000 EBP: bfbdb108 ESP: bfbdafc0 [ 11.161960] DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 007b Cc: stable@vger.kernel.org # 3.10 (requires 2e81a4eeedca as a prereq) Reported-by: George Spelvin Signed-off-by: Theodore Ts'o Signed-off-by: Nathan Chancellor Signed-off-by: Greg Kroah-Hartman --- fs/ext4/inline.c | 66 ++++++++++++++++++++++-------------------------- fs/ext4/xattr.c | 30 +++++++++------------- fs/ext4/xattr.h | 32 +++++++++++++++++++++++ 3 files changed, 74 insertions(+), 54 deletions(-) diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 84da8fd0a..ae003b453 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -377,7 +377,7 @@ static int ext4_update_inline_data(handle_t *handle, struct inode *inode, static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, unsigned int len) { - int ret, size; + int ret, size, no_expand; struct ext4_inode_info *ei = EXT4_I(inode); if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) @@ -387,15 +387,14 @@ static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, if (size < len) return -ENOSPC; - down_write(&EXT4_I(inode)->xattr_sem); + ext4_write_lock_xattr(inode, &no_expand); if (ei->i_inline_off) ret = ext4_update_inline_data(handle, inode, len); else ret = ext4_create_inline_data(handle, inode, len); - up_write(&EXT4_I(inode)->xattr_sem); - + ext4_write_unlock_xattr(inode, &no_expand); return ret; } @@ -529,7 +528,7 @@ static int ext4_convert_inline_data_to_extent(struct address_space *mapping, struct inode *inode, unsigned flags) { - int ret, needed_blocks; + int ret, needed_blocks, no_expand; handle_t *handle = NULL; int retries = 0, sem_held = 0; struct page *page = NULL; @@ -569,7 +568,7 @@ static int ext4_convert_inline_data_to_extent(struct address_space *mapping, goto out; } - down_write(&EXT4_I(inode)->xattr_sem); + ext4_write_lock_xattr(inode, &no_expand); sem_held = 1; /* If some one has already done this for us, just exit. 
*/ if (!ext4_has_inline_data(inode)) { @@ -605,7 +604,7 @@ static int ext4_convert_inline_data_to_extent(struct address_space *mapping, page_cache_release(page); page = NULL; ext4_orphan_add(handle, inode); - up_write(&EXT4_I(inode)->xattr_sem); + ext4_write_unlock_xattr(inode, &no_expand); sem_held = 0; ext4_journal_stop(handle); handle = NULL; @@ -631,7 +630,7 @@ static int ext4_convert_inline_data_to_extent(struct address_space *mapping, page_cache_release(page); } if (sem_held) - up_write(&EXT4_I(inode)->xattr_sem); + ext4_write_unlock_xattr(inode, &no_expand); if (handle) ext4_journal_stop(handle); brelse(iloc.bh); @@ -724,7 +723,7 @@ int ext4_try_to_write_inline_data(struct address_space *mapping, int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, struct page *page) { - int ret; + int ret, no_expand; void *kaddr; struct ext4_iloc iloc; @@ -742,7 +741,7 @@ int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, goto out; } - down_write(&EXT4_I(inode)->xattr_sem); + ext4_write_lock_xattr(inode, &no_expand); BUG_ON(!ext4_has_inline_data(inode)); kaddr = kmap_atomic(page); @@ -752,7 +751,7 @@ int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, /* clear page dirty so that writepages wouldn't work for us. */ ClearPageDirty(page); - up_write(&EXT4_I(inode)->xattr_sem); + ext4_write_unlock_xattr(inode, &no_expand); brelse(iloc.bh); out: return copied; @@ -763,7 +762,7 @@ ext4_journalled_write_inline_data(struct inode *inode, unsigned len, struct page *page) { - int ret; + int ret, no_expand; void *kaddr; struct ext4_iloc iloc; @@ -773,11 +772,11 @@ ext4_journalled_write_inline_data(struct inode *inode, return NULL; } - down_write(&EXT4_I(inode)->xattr_sem); + ext4_write_lock_xattr(inode, &no_expand); kaddr = kmap_atomic(page); ext4_write_inline_data(inode, &iloc, kaddr, 0, len); kunmap_atomic(kaddr); - up_write(&EXT4_I(inode)->xattr_sem); + ext4_write_unlock_xattr(inode, &no_expand); return iloc.bh; } @@ -1261,7 +1260,7 @@ static int ext4_convert_inline_data_nolock(handle_t *handle, int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, struct dentry *dentry, struct inode *inode) { - int ret, inline_size; + int ret, inline_size, no_expand; void *inline_start; struct ext4_iloc iloc; struct inode *dir = d_inode(dentry->d_parent); @@ -1270,7 +1269,7 @@ int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, if (ret) return ret; - down_write(&EXT4_I(dir)->xattr_sem); + ext4_write_lock_xattr(dir, &no_expand); if (!ext4_has_inline_data(dir)) goto out; @@ -1316,7 +1315,7 @@ int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, out: ext4_mark_inode_dirty(handle, dir); - up_write(&EXT4_I(dir)->xattr_sem); + ext4_write_unlock_xattr(dir, &no_expand); brelse(iloc.bh); return ret; } @@ -1676,7 +1675,7 @@ int ext4_delete_inline_entry(handle_t *handle, struct buffer_head *bh, int *has_inline_data) { - int err, inline_size; + int err, inline_size, no_expand; struct ext4_iloc iloc; void *inline_start; @@ -1684,7 +1683,7 @@ int ext4_delete_inline_entry(handle_t *handle, if (err) return err; - down_write(&EXT4_I(dir)->xattr_sem); + ext4_write_lock_xattr(dir, &no_expand); if (!ext4_has_inline_data(dir)) { *has_inline_data = 0; goto out; @@ -1719,7 +1718,7 @@ int ext4_delete_inline_entry(handle_t *handle, ext4_show_inline_dir(dir, iloc.bh, inline_start, inline_size); out: - up_write(&EXT4_I(dir)->xattr_sem); + ext4_write_unlock_xattr(dir, &no_expand); 
brelse(iloc.bh); if (err != -ENOENT) ext4_std_error(dir->i_sb, err); @@ -1818,11 +1817,11 @@ int empty_inline_dir(struct inode *dir, int *has_inline_data) int ext4_destroy_inline_data(handle_t *handle, struct inode *inode) { - int ret; + int ret, no_expand; - down_write(&EXT4_I(inode)->xattr_sem); + ext4_write_lock_xattr(inode, &no_expand); ret = ext4_destroy_inline_data_nolock(handle, inode); - up_write(&EXT4_I(inode)->xattr_sem); + ext4_write_unlock_xattr(inode, &no_expand); return ret; } @@ -1907,7 +1906,7 @@ int ext4_try_to_evict_inline_data(handle_t *handle, void ext4_inline_data_truncate(struct inode *inode, int *has_inline) { handle_t *handle; - int inline_size, value_len, needed_blocks; + int inline_size, value_len, needed_blocks, no_expand; size_t i_size; void *value = NULL; struct ext4_xattr_ibody_find is = { @@ -1924,7 +1923,7 @@ void ext4_inline_data_truncate(struct inode *inode, int *has_inline) if (IS_ERR(handle)) return; - down_write(&EXT4_I(inode)->xattr_sem); + ext4_write_lock_xattr(inode, &no_expand); if (!ext4_has_inline_data(inode)) { *has_inline = 0; ext4_journal_stop(handle); @@ -1982,7 +1981,7 @@ void ext4_inline_data_truncate(struct inode *inode, int *has_inline) up_write(&EXT4_I(inode)->i_data_sem); out: brelse(is.iloc.bh); - up_write(&EXT4_I(inode)->xattr_sem); + ext4_write_unlock_xattr(inode, &no_expand); kfree(value); if (inode->i_nlink) ext4_orphan_del(handle, inode); @@ -1998,7 +1997,7 @@ void ext4_inline_data_truncate(struct inode *inode, int *has_inline) int ext4_convert_inline_data(struct inode *inode) { - int error, needed_blocks; + int error, needed_blocks, no_expand; handle_t *handle; struct ext4_iloc iloc; @@ -2020,15 +2019,10 @@ int ext4_convert_inline_data(struct inode *inode) goto out_free; } - down_write(&EXT4_I(inode)->xattr_sem); - if (!ext4_has_inline_data(inode)) { - up_write(&EXT4_I(inode)->xattr_sem); - goto out; - } - - error = ext4_convert_inline_data_nolock(handle, inode, &iloc); - up_write(&EXT4_I(inode)->xattr_sem); -out: + ext4_write_lock_xattr(inode, &no_expand); + if (ext4_has_inline_data(inode)) + error = ext4_convert_inline_data_nolock(handle, inode, &iloc); + ext4_write_unlock_xattr(inode, &no_expand); ext4_journal_stop(handle); out_free: brelse(iloc.bh); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index c356b4954..b16bfb52e 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1143,16 +1143,14 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, struct ext4_xattr_block_find bs = { .s = { .not_found = -ENODATA, }, }; - unsigned long no_expand; + int no_expand; int error; if (!name) return -EINVAL; if (strlen(name) > 255) return -ERANGE; - down_write(&EXT4_I(inode)->xattr_sem); - no_expand = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND); - ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND); + ext4_write_lock_xattr(inode, &no_expand); error = ext4_reserve_inode_write(handle, inode, &is.iloc); if (error) @@ -1213,7 +1211,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, ext4_xattr_update_super_block(handle, inode->i_sb); inode->i_ctime = ext4_current_time(inode); if (!value) - ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); + no_expand = 0; error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); /* * The bh is consumed by ext4_mark_iloc_dirty, even with @@ -1227,9 +1225,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, cleanup: brelse(is.iloc.bh); brelse(bs.bh); - if (no_expand == 0) - ext4_clear_inode_state(inode, 
EXT4_STATE_NO_EXPAND); - up_write(&EXT4_I(inode)->xattr_sem); + ext4_write_unlock_xattr(inode, &no_expand); return error; } @@ -1313,12 +1309,11 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, int error = 0, tried_min_extra_isize = 0; int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize); int isize_diff; /* How much do we need to grow i_extra_isize */ + int no_expand; + + if (ext4_write_trylock_xattr(inode, &no_expand) == 0) + return 0; - down_write(&EXT4_I(inode)->xattr_sem); - /* - * Set EXT4_STATE_NO_EXPAND to avoid recursion when marking inode dirty - */ - ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND); retry: isize_diff = new_extra_isize - EXT4_I(inode)->i_extra_isize; if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) @@ -1512,8 +1507,7 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, } brelse(bh); out: - ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); - up_write(&EXT4_I(inode)->xattr_sem); + ext4_write_unlock_xattr(inode, &no_expand); return 0; cleanup: @@ -1525,10 +1519,10 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, kfree(bs); brelse(bh); /* - * We deliberately leave EXT4_STATE_NO_EXPAND set here since inode - * size expansion failed. + * Inode size expansion failed; don't try again */ - up_write(&EXT4_I(inode)->xattr_sem); + no_expand = 1; + ext4_write_unlock_xattr(inode, &no_expand); return error; } diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 10b0f7323..cdc413476 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -101,6 +101,38 @@ extern const struct xattr_handler ext4_xattr_security_handler; #define EXT4_XATTR_NAME_ENCRYPTION_CONTEXT "c" +/* + * The EXT4_STATE_NO_EXPAND is overloaded and used for two purposes. + * The first is to signal that there the inline xattrs and data are + * taking up so much space that we might as well not keep trying to + * expand it. The second is that xattr_sem is taken for writing, so + * we shouldn't try to recurse into the inode expansion. For this + * second case, we need to make sure that we take save and restore the + * NO_EXPAND state flag appropriately. + */ +static inline void ext4_write_lock_xattr(struct inode *inode, int *save) +{ + down_write(&EXT4_I(inode)->xattr_sem); + *save = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND); + ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND); +} + +static inline int ext4_write_trylock_xattr(struct inode *inode, int *save) +{ + if (down_write_trylock(&EXT4_I(inode)->xattr_sem) == 0) + return 0; + *save = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND); + ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND); + return 1; +} + +static inline void ext4_write_unlock_xattr(struct inode *inode, int *save) +{ + if (*save == 0) + ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); + up_write(&EXT4_I(inode)->xattr_sem); +} + extern ssize_t ext4_listxattr(struct dentry *, char *, size_t); extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t); From 5311a5a9c1c824bbe2e3247af469c7d2cae7add6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabi=C3=A1n=20Inostroza?= Date: Thu, 12 Apr 2018 00:37:35 -0300 Subject: [PATCH 076/146] ALSA: line6: Use correct endpoint type for midi output MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 7ecb46e9ee9af18e304eb9e7d6804c59a408e846 upstream. 
Sending MIDI messages to a PODxt through the USB connection shows "usb_submit_urb failed" in dmesg and the message is not received by the POD. The error is caused because in the funcion send_midi_async() in midi.c there is a call to usb_sndbulkpipe() for endpoint 3 OUT, but the PODxt USB descriptor shows that this endpoint it's an interrupt endpoint. Patch tested with PODxt only. [ The bug has been present from the very beginning in the staging driver time, but Fixes below points to the commit moving to sound/ directory so that the fix can be cleanly applied -- tiwai ] Fixes: 61864d844c29 ("ALSA: move line6 usb driver into sound/usb") Signed-off-by: Fabián Inostroza Cc: Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/usb/line6/midi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/usb/line6/midi.c b/sound/usb/line6/midi.c index cebea9b7f..6a9be1df7 100644 --- a/sound/usb/line6/midi.c +++ b/sound/usb/line6/midi.c @@ -125,7 +125,7 @@ static int send_midi_async(struct usb_line6 *line6, unsigned char *data, } usb_fill_int_urb(urb, line6->usbdev, - usb_sndbulkpipe(line6->usbdev, + usb_sndintpipe(line6->usbdev, line6->properties->ep_ctrl_w), transfer_buffer, length, midi_sent, line6, line6->interval); From df87f5b07bae635a2dc54c896aa93a42ce19287a Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Thu, 19 Apr 2018 18:16:15 +0200 Subject: [PATCH 077/146] ALSA: rawmidi: Fix missing input substream checks in compat ioctls commit 8a56ef4f3ffba9ebf4967b61ef600b0a7ba10f11 upstream. Some rawmidi compat ioctls lack of the input substream checks (although they do check only for rfile->output). This many eventually lead to an Oops as NULL substream is passed to the rawmidi core functions. Fix it by adding the proper checks before each function call. The bug was spotted by syzkaller. 
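As a rough userspace model of the pattern the diff below applies: the check has to target the substream that matches the requested direction, since validating only the output side lets a NULL input substream reach the core functions. The struct, enum and error value here are simplified stand-ins, not the real ALSA types.

#define EINVAL 22	/* stand-in for the kernel errno value */

enum stream_dir { STREAM_OUTPUT, STREAM_INPUT };

struct rawmidi_file_model {
	void *input;	/* NULL if the device has no input substream */
	void *output;	/* NULL if the device has no output substream */
};

/* Modelled on the fixed compat ioctl path: validate the substream for the
 * direction actually requested before dispatching. */
static int dispatch_params(struct rawmidi_file_model *rfile, enum stream_dir dir)
{
	switch (dir) {
	case STREAM_OUTPUT:
		if (!rfile->output)
			return -EINVAL;
		return 0;	/* would call the output params handler */
	case STREAM_INPUT:
		if (!rfile->input)
			return -EINVAL;
		return 0;	/* would call the input params handler */
	}
	return -EINVAL;
}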
Reported-by: syzbot+f7a0348affc3b67bc617@syzkaller.appspotmail.com Cc: Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/core/rawmidi_compat.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/sound/core/rawmidi_compat.c b/sound/core/rawmidi_compat.c index 09a89094d..4e304a249 100644 --- a/sound/core/rawmidi_compat.c +++ b/sound/core/rawmidi_compat.c @@ -36,8 +36,6 @@ static int snd_rawmidi_ioctl_params_compat(struct snd_rawmidi_file *rfile, struct snd_rawmidi_params params; unsigned int val; - if (rfile->output == NULL) - return -EINVAL; if (get_user(params.stream, &src->stream) || get_user(params.buffer_size, &src->buffer_size) || get_user(params.avail_min, &src->avail_min) || @@ -46,8 +44,12 @@ static int snd_rawmidi_ioctl_params_compat(struct snd_rawmidi_file *rfile, params.no_active_sensing = val; switch (params.stream) { case SNDRV_RAWMIDI_STREAM_OUTPUT: + if (!rfile->output) + return -EINVAL; return snd_rawmidi_output_params(rfile->output, ¶ms); case SNDRV_RAWMIDI_STREAM_INPUT: + if (!rfile->input) + return -EINVAL; return snd_rawmidi_input_params(rfile->input, ¶ms); } return -EINVAL; @@ -67,16 +69,18 @@ static int snd_rawmidi_ioctl_status_compat(struct snd_rawmidi_file *rfile, int err; struct snd_rawmidi_status status; - if (rfile->output == NULL) - return -EINVAL; if (get_user(status.stream, &src->stream)) return -EFAULT; switch (status.stream) { case SNDRV_RAWMIDI_STREAM_OUTPUT: + if (!rfile->output) + return -EINVAL; err = snd_rawmidi_output_status(rfile->output, &status); break; case SNDRV_RAWMIDI_STREAM_INPUT: + if (!rfile->input) + return -EINVAL; err = snd_rawmidi_input_status(rfile->input, &status); break; default: @@ -113,16 +117,18 @@ static int snd_rawmidi_ioctl_status_x32(struct snd_rawmidi_file *rfile, int err; struct snd_rawmidi_status status; - if (rfile->output == NULL) - return -EINVAL; if (get_user(status.stream, &src->stream)) return -EFAULT; switch (status.stream) { case SNDRV_RAWMIDI_STREAM_OUTPUT: + if (!rfile->output) + return -EINVAL; err = snd_rawmidi_output_status(rfile->output, &status); break; case SNDRV_RAWMIDI_STREAM_INPUT: + if (!rfile->input) + return -EINVAL; err = snd_rawmidi_input_status(rfile->input, &status); break; default: From 67af5116bc907db1fcfd5350b029dbe030076aaf Mon Sep 17 00:00:00 2001 From: David Wang Date: Mon, 16 Apr 2018 17:48:09 +0800 Subject: [PATCH 078/146] ALSA: hda - New VIA controller suppor no-snoop path commit af52f9982e410edac21ca4b49563053ffc9da1eb upstream. This patch is used to tell kernel that new VIA HDAC controller also support no-snoop path. 
[ minor coding style fix by tiwai ] Signed-off-by: David Wang Cc: Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/hda_intel.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index fbd00821e..3be91696a 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c @@ -1549,7 +1549,8 @@ static void azx_check_snoop_available(struct azx *chip) */ u8 val; pci_read_config_byte(chip->pci, 0x42, &val); - if (!(val & 0x80) && chip->pci->revision == 0x30) + if (!(val & 0x80) && (chip->pci->revision == 0x30 || + chip->pci->revision == 0x20)) snoop = false; } From 293c9382788d527bcb13cb37c13dfd13891c5aa6 Mon Sep 17 00:00:00 2001 From: Rodrigo Rivas Costa Date: Fri, 6 Apr 2018 01:09:36 +0200 Subject: [PATCH 079/146] HID: hidraw: Fix crash on HIDIOCGFEATURE with a destroyed device commit a955358d54695e4ad9f7d6489a7ac4d69a8fc711 upstream. Doing `ioctl(HIDIOCGFEATURE)` in a tight loop on a hidraw device and then disconnecting the device, or unloading the driver, can cause a NULL pointer dereference. When a hidraw device is destroyed it sets 0 to `dev->exist`. Most functions check 'dev->exist' before doing its work, but `hidraw_get_report()` was missing that check. Cc: stable@vger.kernel.org Signed-off-by: Rodrigo Rivas Costa Signed-off-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- drivers/hid/hidraw.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/hid/hidraw.c b/drivers/hid/hidraw.c index 9ab1f5b6b..e10fe61c1 100644 --- a/drivers/hid/hidraw.c +++ b/drivers/hid/hidraw.c @@ -197,6 +197,11 @@ static ssize_t hidraw_get_report(struct file *file, char __user *buffer, size_t int ret = 0, len; unsigned char report_number; + if (!hidraw_table[minor] || !hidraw_table[minor]->exist) { + ret = -ENODEV; + goto out; + } + dev = hidraw_table[minor]->hid; if (!dev->ll_driver->raw_request) { From 8ef0c74a147e5d5add0f29ceb7580d9a9a7e13b1 Mon Sep 17 00:00:00 2001 From: Matt Redfearn Date: Tue, 17 Apr 2018 16:40:01 +0100 Subject: [PATCH 080/146] MIPS: uaccess: Add micromips clobbers to bzero invocation commit b3d7e55c3f886493235bfee08e1e5a4a27cbcce8 upstream. The micromips implementation of bzero additionally clobbers registers t7 & t8. Specify this in the clobbers list when invoking bzero. 
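The general rule the fix applies can be illustrated with a generic GCC extended-asm snippet (x86-64 here purely as an example, not the MIPS code): any register that the asm statement, or the routine it invokes, may modify has to be named in the clobber list, otherwise the compiler is free to keep a live value in it across the statement.

/* Generic illustration of a clobber list; rax is modified by the asm body,
 * so it is declared, and "memory" tells the compiler the store happened. */
static inline void zero_word(unsigned long *p)
{
	__asm__ __volatile__(
		"xorq %%rax, %%rax\n\t"
		"movq %%rax, (%0)\n\t"
		: /* no outputs */
		: "r" (p)
		: "rax", "memory");
}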
Fixes: 26c5e07d1478 ("MIPS: microMIPS: Optimise 'memset' core library function.") Reported-by: James Hogan Signed-off-by: Matt Redfearn Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: # 3.10+ Patchwork: https://patchwork.linux-mips.org/patch/19110/ Signed-off-by: James Hogan Signed-off-by: Greg Kroah-Hartman --- arch/mips/include/asm/uaccess.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/mips/include/asm/uaccess.h b/arch/mips/include/asm/uaccess.h index c74c32ccc..4f2817689 100644 --- a/arch/mips/include/asm/uaccess.h +++ b/arch/mips/include/asm/uaccess.h @@ -1238,6 +1238,13 @@ __clear_user(void __user *addr, __kernel_size_t size) { __kernel_size_t res; +#ifdef CONFIG_CPU_MICROMIPS +/* micromips memset / bzero also clobbers t7 & t8 */ +#define bzero_clobbers "$4", "$5", "$6", __UA_t0, __UA_t1, "$15", "$24", "$31" +#else +#define bzero_clobbers "$4", "$5", "$6", __UA_t0, __UA_t1, "$31" +#endif /* CONFIG_CPU_MICROMIPS */ + if (eva_kernel_access()) { __asm__ __volatile__( "move\t$4, %1\n\t" @@ -1247,7 +1254,7 @@ __clear_user(void __user *addr, __kernel_size_t size) "move\t%0, $6" : "=r" (res) : "r" (addr), "r" (size) - : "$4", "$5", "$6", __UA_t0, __UA_t1, "$31"); + : bzero_clobbers); } else { might_fault(); __asm__ __volatile__( @@ -1258,7 +1265,7 @@ __clear_user(void __user *addr, __kernel_size_t size) "move\t%0, $6" : "=r" (res) : "r" (addr), "r" (size) - : "$4", "$5", "$6", __UA_t0, __UA_t1, "$31"); + : bzero_clobbers); } return res; From 5add579ffb1aa24adcb578f9c7865a3f618f017c Mon Sep 17 00:00:00 2001 From: Matt Redfearn Date: Thu, 29 Mar 2018 10:28:23 +0100 Subject: [PATCH 081/146] MIPS: memset.S: EVA & fault support for small_memset commit 8a8158c85e1e774a44fbe81106fa41138580dfd1 upstream. The MIPS kernel memset / bzero implementation includes a small_memset branch which is used when the region to be set is smaller than a long (4 bytes on 32bit, 8 bytes on 64bit). The current small_memset implementation uses a simple store byte loop to write the destination. There are 2 issues with this implementation: 1. When EVA mode is active, user and kernel address spaces may overlap. Currently the use of the sb instruction means kernel mode addressing is always used and an intended write to userspace may actually overwrite some critical kernel data. 2. If the write triggers a page fault, for example by calling __clear_user(NULL, 2), instead of gracefully handling the fault, an OOPS is triggered. Fix these issues by replacing the sb instruction with the EX() macro, which will emit EVA compatible instuctions as required. Additionally implement a fault fixup for small_memset which sets a2 to the number of bytes that could not be cleared (as defined by __clear_user). 
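For reference, the contract the new fixup implements can be modelled in plain C: on a fault partway through, the caller gets back the number of bytes that were not cleared. This is only an illustration of the semantics, not the assembly fixup in the diff below.

#include <stddef.h>

/* Userspace model of __clear_user(): zero n bytes at dst and return how
 * many bytes could NOT be cleared. fault_at simulates the first offset at
 * which a page fault would occur (use n for "no fault"). */
static size_t model_clear_user(unsigned char *dst, size_t n, size_t fault_at)
{
	size_t i;

	for (i = 0; i < n; i++) {
		if (i == fault_at)
			return n - i;	/* bytes left untouched */
		dst[i] = 0;
	}
	return 0;			/* everything was cleared */
}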
Reported-by: Chuanhua Lei Signed-off-by: Matt Redfearn Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: stable@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/18975/ Signed-off-by: James Hogan Signed-off-by: Greg Kroah-Hartman --- arch/mips/lib/memset.S | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/mips/lib/memset.S b/arch/mips/lib/memset.S index 8f0019a2e..1e88105a4 100644 --- a/arch/mips/lib/memset.S +++ b/arch/mips/lib/memset.S @@ -218,7 +218,7 @@ 1: PTR_ADDIU a0, 1 /* fill bytewise */ R10KCBARRIER(0(ra)) bne t1, a0, 1b - sb a1, -1(a0) + EX(sb, a1, -1(a0), .Lsmall_fixup\@) 2: jr ra /* done */ move a2, zero @@ -257,6 +257,11 @@ jr ra andi v1, a2, STORMASK +.Lsmall_fixup\@: + PTR_SUBU a2, t1, a0 + jr ra + PTR_ADDIU a2, 1 + .endm /* From 9937222396e7ba3875f84cac636802f80c03aaf3 Mon Sep 17 00:00:00 2001 From: Matt Redfearn Date: Tue, 17 Apr 2018 15:52:21 +0100 Subject: [PATCH 082/146] MIPS: memset.S: Fix return of __clear_user from Lpartial_fixup commit daf70d89f80c6e1772233da9e020114b1254e7e0 upstream. The __clear_user function is defined to return the number of bytes that could not be cleared. From the underlying memset / bzero implementation this means setting register a2 to that number on return. Currently if a page fault is triggered within the memset_partial block, the value loaded into a2 on return is meaningless. The label .Lpartial_fixup\@ is jumped to on page fault. In order to work out how many bytes failed to copy, the exception handler should find how many bytes left in the partial block (andi a2, STORMASK), add that to the partial block end address (a2), and subtract the faulting address to get the remainder. Currently it incorrectly subtracts the partial block start address (t1), which has additionally been clobbered to generate a jump target in memset_partial. Fix this by adding the block end address instead. This issue was found with the following test code: int j, k; for (j = 0; j < 512; j++) { if ((k = clear_user(NULL, j)) != j) { pr_err("clear_user (NULL %d) returned %d\n", j, k); } } Which now passes on Creator Ci40 (MIPS32) and Cavium Octeon II (MIPS64). Suggested-by: James Hogan Signed-off-by: Matt Redfearn Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: stable@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/19108/ Signed-off-by: James Hogan Signed-off-by: Greg Kroah-Hartman --- arch/mips/lib/memset.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/lib/memset.S b/arch/mips/lib/memset.S index 1e88105a4..a02d98bee 100644 --- a/arch/mips/lib/memset.S +++ b/arch/mips/lib/memset.S @@ -249,7 +249,7 @@ PTR_L t0, TI_TASK($28) andi a2, STORMASK LONG_L t0, THREAD_BUADDR(t0) - LONG_ADDU a2, t1 + LONG_ADDU a2, a0 jr ra LONG_SUBU a2, t0 From 26d81ee6b9c17c8e1f393b4ded645b60ea95e5f3 Mon Sep 17 00:00:00 2001 From: Matt Redfearn Date: Tue, 17 Apr 2018 16:40:00 +0100 Subject: [PATCH 083/146] MIPS: memset.S: Fix clobber of v1 in last_fixup commit c96eebf07692e53bf4dd5987510d8b550e793598 upstream. The label .Llast_fixup\@ is jumped to on page fault within the final byte set loop of memset (on < MIPSR6 architectures). For some reason, in this fault handler, the v1 register is randomly set to a2 & STORMASK. This clobbers v1 for the calling function. 
This can be observed with the following test code: static int __init __attribute__((optimize("O0"))) test_clear_user(void) { register int t asm("v1"); char *test; int j, k; pr_info("\n\n\nTesting clear_user\n"); test = vmalloc(PAGE_SIZE); for (j = 256; j < 512; j++) { t = 0xa5a5a5a5; if ((k = clear_user(test + PAGE_SIZE - 256, j)) != j - 256) { pr_err("clear_user (%px %d) returned %d\n", test + PAGE_SIZE - 256, j, k); } if (t != 0xa5a5a5a5) { pr_err("v1 was clobbered to 0x%x!\n", t); } } return 0; } late_initcall(test_clear_user); Which demonstrates that v1 is indeed clobbered (MIPS64): Testing clear_user v1 was clobbered to 0x1! v1 was clobbered to 0x2! v1 was clobbered to 0x3! v1 was clobbered to 0x4! v1 was clobbered to 0x5! v1 was clobbered to 0x6! v1 was clobbered to 0x7! Since the number of bytes that could not be set is already contained in a2, the andi placing a value in v1 is not necessary and actively harmful in clobbering v1. Reported-by: James Hogan Signed-off-by: Matt Redfearn Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: stable@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/19109/ Signed-off-by: James Hogan Signed-off-by: Greg Kroah-Hartman --- arch/mips/lib/memset.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/lib/memset.S b/arch/mips/lib/memset.S index a02d98bee..2d33cf218 100644 --- a/arch/mips/lib/memset.S +++ b/arch/mips/lib/memset.S @@ -255,7 +255,7 @@ .Llast_fixup\@: jr ra - andi v1, a2, STORMASK + nop .Lsmall_fixup\@: PTR_SUBU a2, t1, a0 From d202b90e9e5b8e7eaa61f840de2606f1a273d33a Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Wed, 11 Apr 2018 13:37:58 +1000 Subject: [PATCH 084/146] powerpc/eeh: Fix enabling bridge MMIO windows commit 13a83eac373c49c0a081cbcd137e79210fe78acd upstream. On boot we save the configuration space of PCIe bridges. We do this so when we get an EEH event and everything gets reset that we can restore them. Unfortunately we save this state before we've enabled the MMIO space on the bridges. Hence if we have to reset the bridge when we come back MMIO is not enabled and we end up taking an PE freeze when the driver starts accessing again. This patch forces the memory/MMIO and bus mastering on when restoring bridges on EEH. Ideally we'd do this correctly by saving the configuration space writes later, but that will have to come later in a larger EEH rewrite. For now we have this simple fix. The original bug can be triggered on a boston machine by doing: echo 0x8000000000000000 > /sys/kernel/debug/powerpc/PCI0001/err_injct_outbound On boston, this PHB has a PCIe switch on it. Without this patch, you'll see two EEH events, 1 expected and 1 the failure we are fixing here. The second EEH event causes the anything under the PHB to disappear (i.e. the i40e eth). With this patch, only 1 EEH event occurs and devices properly recover. 
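The essence of the workaround in the diff below is a one-line change to the restored command word; a minimal sketch using the standard PCI_COMMAND bit values:

#include <stdint.h>

#define PCI_COMMAND_MEMORY	0x2	/* enable MMIO decode */
#define PCI_COMMAND_MASTER	0x4	/* enable bus mastering */

/* When writing back a command register that was saved before MMIO was
 * enabled, force memory decode and bus mastering on so the devices behind
 * the bridge are usable immediately after the reset. */
static uint32_t restore_command(uint32_t saved)
{
	return saved | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER;
}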
Fixes: 652defed4875 ("powerpc/eeh: Check PCIe link after reset") Cc: stable@vger.kernel.org # v3.11+ Reported-by: Pridhiviraj Paidipeddi Signed-off-by: Michael Neuling Acked-by: Russell Currey Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/kernel/eeh_pe.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index 98f81800e..304f07cfa 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -788,7 +788,8 @@ static void eeh_restore_bridge_bars(struct eeh_dev *edev) eeh_ops->write_config(pdn, 15*4, 4, edev->config_space[15]); /* PCI Command: 0x4 */ - eeh_ops->write_config(pdn, PCI_COMMAND, 4, edev->config_space[1]); + eeh_ops->write_config(pdn, PCI_COMMAND, 4, edev->config_space[1] | + PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER); /* Check the PCIe link is ready */ eeh_bridge_check_link(edev); From c40f2ed343d00a02729b97290835a91da5a6126c Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 16 Apr 2018 23:25:19 +1000 Subject: [PATCH 085/146] powerpc/lib: Fix off-by-one in alternate feature patching commit b8858581febb050688e276b956796bc4a78299ed upstream. When we patch an alternate feature section, we have to adjust any relative branches that branch out of the alternate section. But currently we have a bug if we have a branch that points to past the last instruction of the alternate section, eg: FTR_SECTION_ELSE 1: b 2f or 6,6,6 2: ALT_FTR_SECTION_END(...) nop This will result in a relative branch at 1 with a target that equals the end of the alternate section. That branch does not need adjusting when it's moved to the non-else location. Currently we do adjust it, resulting in a branch that goes off into the link-time location of the else section, which is junk. The fix is to not patch branches that have a target == end of the alternate section. Fixes: d20fe50a7b3c ("KVM: PPC: Book3S HV: Branch inside feature section") Fixes: 9b1a735de64c ("powerpc: Add logic to patch alternative feature sections") Cc: stable@vger.kernel.org # v2.6.27+ Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/lib/feature-fixups.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c index a18d648d3..3af014684 100644 --- a/arch/powerpc/lib/feature-fixups.c +++ b/arch/powerpc/lib/feature-fixups.c @@ -53,7 +53,7 @@ static int patch_alt_instruction(unsigned int *src, unsigned int *dest, unsigned int *target = (unsigned int *)branch_target(src); /* Branch within the section doesn't need translating */ - if (target < alt_start || target >= alt_end) { + if (target < alt_start || target > alt_end) { instr = translate_branch(dest, src); if (!instr) return 1; From 7a694263671e8ef8cb10d194e61fdcc462462899 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 2 Apr 2018 23:56:44 -0400 Subject: [PATCH 086/146] jffs2_kill_sb(): deal with failed allocations commit c66b23c2840446a82c389e4cb1a12eb2a71fa2e4 upstream. jffs2_fill_super() might fail to allocate jffs2_sb_info; jffs2_kill_sb() must survive that. 
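The same defensive shape appears here and in the hypfs fix that follows: ->kill_sb() can run even though fill_super() failed before allocating its private data, so that data must be checked before use. A generic sketch of the pattern, with illustrative names rather than the jffs2 structures:

#include <stdlib.h>

struct example_sb_info {
	int gc_thread_running;
};

struct super_block_model {
	void *s_fs_info;	/* still NULL if fill_super() failed early */
};

static void stop_gc_thread(struct example_sb_info *c)
{
	c->gc_thread_running = 0;
}

/* kill_sb must tolerate a missing private pointer. */
static void example_kill_sb(struct super_block_model *sb)
{
	struct example_sb_info *c = sb->s_fs_info;

	if (c)			/* the allocation may never have happened */
		stop_gc_thread(c);
	free(c);		/* free(NULL) is a harmless no-op */
}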
Cc: stable@kernel.org Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- fs/jffs2/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index d86c5e317..600da1a4d 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -345,7 +345,7 @@ static void jffs2_put_super (struct super_block *sb) static void jffs2_kill_sb(struct super_block *sb) { struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); - if (!(sb->s_flags & MS_RDONLY)) + if (c && !(sb->s_flags & MS_RDONLY)) jffs2_stop_garbage_collect_thread(c); kill_mtd_super(sb); kfree(c); From 062b0252eca2b1a48d133002070f706bb8630707 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 2 Apr 2018 23:50:31 -0400 Subject: [PATCH 087/146] hypfs_kill_super(): deal with failed allocations commit a24cd490739586a7d2da3549a1844e1d7c4f4fc4 upstream. hypfs_fill_super() might fail to allocate sbi; hypfs_kill_super() should not oops on that. Cc: stable@vger.kernel.org Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- arch/s390/hypfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c index b2e5902bd..c670279b3 100644 --- a/arch/s390/hypfs/inode.c +++ b/arch/s390/hypfs/inode.c @@ -318,7 +318,7 @@ static void hypfs_kill_super(struct super_block *sb) if (sb->s_root) hypfs_delete_tree(sb->s_root); - if (sb_info->update_file) + if (sb_info && sb_info->update_file) hypfs_remove(sb_info->update_file); kfree(sb->s_fs_info); sb->s_fs_info = NULL; From 4628afffda0729820521f7f5474171a1c10ddf26 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 3 Apr 2018 01:15:46 -0400 Subject: [PATCH 088/146] rpc_pipefs: fix double-dput() commit 4a3877c4cedd95543f8726b0a98743ed8db0c0fb upstream. if we ever hit rpc_gssd_dummy_depopulate() dentry passed to it has refcount equal to 1. __rpc_rmpipe() drops it and dput() done after that hits an already freed dentry. Cc: stable@kernel.org Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- net/sunrpc/rpc_pipe.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index d81186d34..9103dd155 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -1375,6 +1375,7 @@ rpc_gssd_dummy_depopulate(struct dentry *pipe_dentry) struct dentry *clnt_dir = pipe_dentry->d_parent; struct dentry *gssd_dir = clnt_dir->d_parent; + dget(pipe_dentry); __rpc_rmpipe(d_inode(clnt_dir), pipe_dentry); __rpc_depopulate(clnt_dir, gssd_dummy_info_file, 0, 1); __rpc_depopulate(gssd_dir, gssd_dummy_clnt_dir, 0, 1); From 0bc9692d8a915b0f70c0e62d0570f6182c37d93b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 19 Apr 2018 22:03:08 -0400 Subject: [PATCH 089/146] Don't leak MNT_INTERNAL away from internal mounts commit 16a34adb9392b2fe4195267475ab5b472e55292c upstream. We want it only for the stuff created by SB_KERNMOUNT mounts, *not* for their copies. As it is, creating a deep stack of bindings of /proc/*/ns/* somewhere in a new namespace and exiting yields a stack overflow. 
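A compact way to read the fix in the diff below: when cloning a mount, inherit the flags but strip the ones that must stay private to the original instance, MNT_INTERNAL included. The bit assignments here are illustrative, not the kernel's definitions.

#include <stdint.h>

/* Illustrative bit positions only. */
#define MNT_WRITE_HOLD	(1u << 0)
#define MNT_MARKED	(1u << 1)
#define MNT_INTERNAL	(1u << 2)	/* only for SB_KERNMOUNT-created mounts */

/* Clone-time flag handling: copy everything except per-instance state. */
static uint32_t clone_mnt_flags(uint32_t old_flags)
{
	return old_flags & ~(MNT_WRITE_HOLD | MNT_MARKED | MNT_INTERNAL);
}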
Cc: stable@kernel.org Reported-by: Alexander Aring Bisected-by: Kirill Tkhai Tested-by: Kirill Tkhai Tested-by: Alexander Aring Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- fs/namespace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/namespace.c b/fs/namespace.c index 38e42eb4b..0189f3f5d 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1035,7 +1035,8 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, goto out_free; } - mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED); + mnt->mnt.mnt_flags = old->mnt.mnt_flags; + mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL); /* Don't allow unprivileged users to change mount flags */ if (flag & CL_UNPRIVILEGED) { mnt->mnt.mnt_flags |= MNT_LOCK_ATIME; From a66b8132fe183284cb162fcaa254b85e5a0b0e9f Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Fri, 20 Apr 2018 14:55:59 -0700 Subject: [PATCH 090/146] autofs: mount point create should honour passed in mode commit 1e6306652ba18723015d1b4967fe9de55f042499 upstream. The autofs file system mkdir inode operation blindly sets the created directory mode to S_IFDIR | 0555, ingoring the passed in mode, which can cause selinux dac_override denials. But the function also checks if the caller is the daemon (as no-one else should be able to do anything here) so there's no point in not honouring the passed in mode, allowing the daemon to set appropriate mode when required. Link: http://lkml.kernel.org/r/152361593601.8051.14014139124905996173.stgit@pluto.themaw.net Signed-off-by: Ian Kent Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/autofs4/root.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 7a54c6a86..500098cdb 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -746,7 +746,7 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, umode_t m autofs4_del_active(dentry); - inode = autofs4_get_inode(dir->i_sb, S_IFDIR | 0555); + inode = autofs4_get_inode(dir->i_sb, S_IFDIR | mode); if (!inode) return -ENOMEM; d_add(dentry, inode); From db63c6f5daa139555b21688975e55e7b2a05654b Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 14 Jan 2016 15:20:12 -0800 Subject: [PATCH 091/146] mm: allow GFP_{FS,IO} for page_cache_read page cache allocation commit c20cd45eb01748f0fba77a504f956b000df4ea73 upstream. page_cache_read has been historically using page_cache_alloc_cold to allocate a new page. This means that mapping_gfp_mask is used as the base for the gfp_mask. Many filesystems are setting this mask to GFP_NOFS to prevent from fs recursion issues. page_cache_read is called from the vm_operations_struct::fault() context during the page fault. This context doesn't need the reclaim protection normally. ceph and ocfs2 which call filemap_fault from their fault handlers seem to be OK because they are not taking any fs lock before invoking generic implementation. xfs which takes XFS_MMAPLOCK_SHARED is safe from the reclaim recursion POV because this lock serializes truncate and punch hole with the page faults and it doesn't get involved in the reclaim. There is simply no reason to deliberately use a weaker allocation context when a __GFP_FS | __GFP_IO can be used. The GFP_NOFS protection might be even harmful. There is a push to fail GFP_NOFS allocations rather than loop within allocator indefinitely with a very limited reclaim ability. 
Once we start failing those requests the OOM killer might be triggered prematurely because the page cache allocation failure is propagated up the page fault path and end up in pagefault_out_of_memory. We cannot play with mapping_gfp_mask directly because that would be racy wrt. parallel page faults and it might interfere with other users who really rely on NOFS semantic from the stored gfp_mask. The mask is also inode proper so it would even be a layering violation. What we can do instead is to push the gfp_mask into struct vm_fault and allow fs layer to overwrite it should the callback need to be called with a different allocation context. Initialize the default to (mapping_gfp_mask | __GFP_FS | __GFP_IO) because this should be safe from the page fault path normally. Why do we care about mapping_gfp_mask at all then? Because this doesn't hold only reclaim protection flags but it also might contain zone and movability restrictions (GFP_DMA32, __GFP_MOVABLE and others) so we have to respect those. Signed-off-by: Michal Hocko Reported-by: Tetsuo Handa Acked-by: Jan Kara Acked-by: Vlastimil Babka Cc: Tetsuo Handa Cc: Mel Gorman Cc: Dave Chinner Cc: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/linux/mm.h | 4 ++++ mm/filemap.c | 9 ++++----- mm/memory.c | 17 +++++++++++++++++ 3 files changed, 25 insertions(+), 5 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 355013f7b..6d6f83dda 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -236,10 +236,14 @@ extern pgprot_t protection_map[16]; * ->fault function. The vma's ->fault is responsible for returning a bitmask * of VM_FAULT_xxx flags that give details about how the fault was handled. * + * MM layer fills up gfp_mask for page allocations but fault handler might + * alter it if its implementation requires a different allocation context. + * * pgoff should be used in favour of virtual_address, if possible. */ struct vm_fault { unsigned int flags; /* FAULT_FLAG_xxx flags */ + gfp_t gfp_mask; /* gfp mask to be used for allocations */ pgoff_t pgoff; /* Logical page offset based on vma */ void __user *virtual_address; /* Faulting virtual address */ diff --git a/mm/filemap.c b/mm/filemap.c index c8f86dbef..5411612a5 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1899,19 +1899,18 @@ EXPORT_SYMBOL(generic_file_read_iter); * This adds the requested page to the page cache if it isn't already there, * and schedules an I/O to read in its contents from disk. */ -static int page_cache_read(struct file *file, pgoff_t offset) +static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask) { struct address_space *mapping = file->f_mapping; struct page *page; int ret; do { - page = page_cache_alloc_cold(mapping); + page = __page_cache_alloc(gfp_mask|__GFP_COLD); if (!page) return -ENOMEM; - ret = add_to_page_cache_lru(page, mapping, offset, - mapping_gfp_constraint(mapping, GFP_KERNEL)); + ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask & GFP_KERNEL); if (ret == 0) ret = mapping->a_ops->readpage(file, page); else if (ret == -EEXIST) @@ -2139,7 +2138,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) * effect. 
*/ task_set_in_pagefault(current); - error = page_cache_read(file, offset); + error = page_cache_read(file, offset, vmf->gfp_mask); task_clear_in_pagefault(current); /* diff --git a/mm/memory.c b/mm/memory.c index cc72034cf..dd3533e80 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1994,6 +1994,20 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo copy_user_highpage(dst, src, va, vma); } +static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma) +{ + struct file *vm_file = vma->vm_file; + + if (vm_file) + return mapping_gfp_mask(vm_file->f_mapping) | __GFP_FS | __GFP_IO; + + /* + * Special mappings (e.g. VDSO) do not have any file so fake + * a default GFP_KERNEL for them. + */ + return GFP_KERNEL; +} + /* * Notify the address space that the page is about to become writable so that * it can prohibit this or wait for the page to get into an appropriate state. @@ -2009,6 +2023,7 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, vmf.virtual_address = (void __user *)(address & PAGE_MASK); vmf.pgoff = page->index; vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; + vmf.gfp_mask = __get_fault_gfp_mask(vma); vmf.page = page; vmf.cow_page = NULL; @@ -2792,6 +2807,7 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address, vmf.pgoff = pgoff; vmf.flags = flags; vmf.page = NULL; + vmf.gfp_mask = __get_fault_gfp_mask(vma); vmf.cow_page = cow_page; ret = vma->vm_ops->fault(vma, &vmf); @@ -2958,6 +2974,7 @@ static void do_fault_around(struct vm_area_struct *vma, unsigned long address, vmf.pgoff = pgoff; vmf.max_pgoff = max_pgoff; vmf.flags = flags; + vmf.gfp_mask = __get_fault_gfp_mask(vma); vma->vm_ops->map_pages(vma, &vmf); } From 98b3413b88e1e99d59335807c8be1f6b14c6c475 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 20 Apr 2018 14:56:20 -0700 Subject: [PATCH 092/146] mm/filemap.c: fix NULL pointer in page_cache_tree_insert() commit abc1be13fd113ddef5e2d807a466286b864caed3 upstream. f2fs specifies the __GFP_ZERO flag for allocating some of its pages. Unfortunately, the page cache also uses the mapping's GFP flags for allocating radix tree nodes. It always masked off the __GFP_HIGHMEM flag, and masks off __GFP_ZERO in some paths, but not all. That causes radix tree nodes to be allocated with a NULL list_head, which causes backtraces like: __list_del_entry+0x30/0xd0 list_lru_del+0xac/0x1ac page_cache_tree_insert+0xd8/0x110 The __GFP_DMA and __GFP_DMA32 flags would also be able to sneak through if they are ever used. Fix them all by using GFP_RECLAIM_MASK at the innermost location, and remove it from earlier in the callchain. 
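To make the masking concrete, here is a condensed, annotated fragment of the pattern the hunks below apply (a sketch, not a verbatim copy of mm/filemap.c): the caller's gfp mask still drives the data-page allocation, but only its reclaim-related bits are allowed to reach the radix-tree node preload.

        /* Sketch of the allocation split described above (simplified):
         * the mapping's mask - which may include __GFP_ZERO on f2fs, or
         * __GFP_HIGHMEM elsewhere - is used for the page itself, while
         * the radix-tree node preload only ever sees reclaim-related bits.
         */
        gfp_t gfp_mask = mapping_gfp_mask(mapping);     /* may carry __GFP_ZERO */
        struct page *page;
        int error;

        page = __page_cache_alloc(gfp_mask);            /* data page: full mask */
        if (!page)
                return -ENOMEM;

        error = radix_tree_maybe_preload(gfp_mask & GFP_RECLAIM_MASK);
        /* tree nodes: __GFP_ZERO, __GFP_HIGHMEM, __GFP_DMA* filtered out */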
Link: http://lkml.kernel.org/r/20180411060320.14458-2-willy@infradead.org Fixes: 449dd6984d0e ("mm: keep page cache radix tree nodes in check") Signed-off-by: Matthew Wilcox Reported-by: Chris Fries Debugged-by: Minchan Kim Acked-by: Johannes Weiner Acked-by: Michal Hocko Reviewed-by: Jan Kara Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/filemap.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 5411612a5..5ea1e45ac 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -593,7 +593,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) VM_BUG_ON_PAGE(!PageLocked(new), new); VM_BUG_ON_PAGE(new->mapping, new); - error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); + error = radix_tree_preload(gfp_mask & GFP_RECLAIM_MASK); if (!error) { struct address_space *mapping = old->mapping; void (*freepage)(struct page *); @@ -652,7 +652,7 @@ static int __add_to_page_cache_locked(struct page *page, return error; } - error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM); + error = radix_tree_maybe_preload(gfp_mask & GFP_RECLAIM_MASK); if (error) { if (!huge) mem_cgroup_cancel_charge(page, memcg); @@ -1218,8 +1218,7 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, if (fgp_flags & FGP_ACCESSED) __SetPageReferenced(page); - err = add_to_page_cache_lru(page, mapping, offset, - gfp_mask & GFP_RECLAIM_MASK); + err = add_to_page_cache_lru(page, mapping, offset, gfp_mask); if (unlikely(err)) { page_cache_release(page); page = NULL; @@ -1910,7 +1909,7 @@ static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask) if (!page) return -ENOMEM; - ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask & GFP_KERNEL); + ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask); if (ret == 0) ret = mapping->a_ops->readpage(file, page); else if (ret == -EEXIST) From 6fa4b6dcbbaa0af55c9b8d9235948d3a0a4a1fd4 Mon Sep 17 00:00:00 2001 From: wangguang Date: Thu, 15 Sep 2016 11:32:46 -0400 Subject: [PATCH 093/146] ext4: bugfix for mmaped pages in mpage_release_unused_pages() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 4e800c0359d9a53e6bf0ab216954971b2515247f upstream. Pages clear buffers after ext4 delayed block allocation failed, However, it does not clean its pte_dirty flag. if the pages unmap ,in cording to the pte_dirty , unmap_page_range may try to call __set_page_dirty, which may lead to the bugon at mpage_prepare_extent_to_map:head = page_buffers(page);. This patch just call clear_page_dirty_for_io to clean pte_dirty at mpage_release_unused_pages for pages mmaped. Steps to reproduce the bug: (1) mmap a file in ext4 addr = (char *)mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); memset(addr, 'i', 4096); (2) return EIO at ext4_writepages->mpage_map_and_submit_extent->mpage_map_one_extent which causes this log message to be print: ext4_msg(sb, KERN_CRIT, "Delayed block allocation failed for " "inode %lu at logical offset %llu with" " max blocks %u with error %d", inode->i_ino, (unsigned long long)map->m_lblk, (unsigned)map->m_len, -err); (3)Unmap the addr cause warning at __set_page_dirty:WARN_ON_ONCE(warn && !PageUptodate(page)); (4) wait for a minute,then bugon happen. 
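Put together as a single userspace program, the reproducer above looks roughly like this (a sketch only — the file path is a placeholder, and step (2) still has to be provoked separately, e.g. by filling the filesystem or injecting an I/O error so the delayed allocation fails at writeback time):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/mnt/ext4/testfile", O_RDWR | O_CREAT, 0644);
        char *addr;

        if (fd < 0) {
                perror("open");
                return 1;
        }
        if (ftruncate(fd, 4096)) {              /* make the first page valid */
                perror("ftruncate");
                return 1;
        }

        /* step (1): dirty the page through a shared mapping */
        addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (addr == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        memset(addr, 'i', 4096);

        /* step (2) happens in the kernel: delayed block allocation must
         * fail at writeback time, clearing the page's buffers           */

        /* step (3): unmapping walks the dirty pte and re-dirties the page */
        munmap(addr, 4096);
        close(fd);

        /* step (4): wait for writeback to trip over the buffer-less page */
        sleep(60);
        return 0;
}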
Cc: stable@vger.kernel.org Signed-off-by: wangguang Signed-off-by: Theodore Ts'o [@nathanchance: Resolved conflict from lack of 09cbfeaf1a5a6] Signed-off-by: Nathan Chancellor Signed-off-by: Greg Kroah-Hartman --- fs/ext4/inode.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 04aed40ae..154f7ed65 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1526,6 +1526,8 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); if (invalidate) { + if (page_mapped(page)) + clear_page_dirty_for_io(page); block_invalidatepage(page, 0, PAGE_CACHE_SIZE); ClearPageUptodate(page); } From d7f5b458f642c707433ce1fe31d962408a5201bd Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 4 Apr 2018 23:42:18 +0300 Subject: [PATCH 094/146] fanotify: fix logic of events on child commit 54a307ba8d3cd00a3902337ffaae28f436eeb1a4 upstream. When event on child inodes are sent to the parent inode mark and parent inode mark was not marked with FAN_EVENT_ON_CHILD, the event will not be delivered to the listener process. However, if the same process also has a mount mark, the event to the parent inode will be delivered regadless of the mount mark mask. This behavior is incorrect in the case where the mount mark mask does not contain the specific event type. For example, the process adds a mark on a directory with mask FAN_MODIFY (without FAN_EVENT_ON_CHILD) and a mount mark with mask FAN_CLOSE_NOWRITE (without FAN_ONDIR). A modify event on a file inside that directory (and inside that mount) should not create a FAN_MODIFY event, because neither of the marks requested to get that event on the file. Fixes: 1968f5eed54c ("fanotify: use both marks when possible") Cc: stable Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara [natechancellor: Fix small conflict due to lack of 3cd5eca8d7a2f] Signed-off-by: Nathan Chancellor Signed-off-by: Greg Kroah-Hartman --- fs/notify/fanotify/fanotify.c | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index e0e5f7c3c..8a459b179 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -92,7 +92,7 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark, u32 event_mask, void *data, int data_type) { - __u32 marks_mask, marks_ignored_mask; + __u32 marks_mask = 0, marks_ignored_mask = 0; struct path *path = data; pr_debug("%s: inode_mark=%p vfsmnt_mark=%p mask=%x data=%p" @@ -108,24 +108,20 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark, !d_can_lookup(path->dentry)) return false; - if (inode_mark && vfsmnt_mark) { - marks_mask = (vfsmnt_mark->mask | inode_mark->mask); - marks_ignored_mask = (vfsmnt_mark->ignored_mask | inode_mark->ignored_mask); - } else if (inode_mark) { - /* - * if the event is for a child and this inode doesn't care about - * events on the child, don't send it! - */ - if ((event_mask & FS_EVENT_ON_CHILD) && - !(inode_mark->mask & FS_EVENT_ON_CHILD)) - return false; - marks_mask = inode_mark->mask; - marks_ignored_mask = inode_mark->ignored_mask; - } else if (vfsmnt_mark) { - marks_mask = vfsmnt_mark->mask; - marks_ignored_mask = vfsmnt_mark->ignored_mask; - } else { - BUG(); + /* + * if the event is for a child and this inode doesn't care about + * events on the child, don't send it! 
+ */ + if (inode_mark && + (!(event_mask & FS_EVENT_ON_CHILD) || + (inode_mark->mask & FS_EVENT_ON_CHILD))) { + marks_mask |= inode_mark->mask; + marks_ignored_mask |= inode_mark->ignored_mask; + } + + if (vfsmnt_mark) { + marks_mask |= vfsmnt_mark->mask; + marks_ignored_mask |= vfsmnt_mark->ignored_mask; } if (d_is_dir(path->dentry) && From 05c905061c6003475f8342037b11a1bfbe0246b4 Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Fri, 20 Apr 2018 14:55:42 -0700 Subject: [PATCH 095/146] writeback: safer lock nesting commit 2e898e4c0a3897ccd434adac5abb8330194f527b upstream. lock_page_memcg()/unlock_page_memcg() use spin_lock_irqsave/restore() if the page's memcg is undergoing move accounting, which occurs when a process leaves its memcg for a new one that has memory.move_charge_at_immigrate set. unlocked_inode_to_wb_begin,end() use spin_lock_irq/spin_unlock_irq() if the given inode is switching writeback domains. Switches occur when enough writes are issued from a new domain. This existing pattern is thus suspicious: lock_page_memcg(page); unlocked_inode_to_wb_begin(inode, &locked); ... unlocked_inode_to_wb_end(inode, locked); unlock_page_memcg(page); If both inode switch and process memcg migration are both in-flight then unlocked_inode_to_wb_end() will unconditionally enable interrupts while still holding the lock_page_memcg() irq spinlock. This suggests the possibility of deadlock if an interrupt occurs before unlock_page_memcg(). truncate __cancel_dirty_page lock_page_memcg unlocked_inode_to_wb_begin unlocked_inode_to_wb_end end_page_writeback test_clear_page_writeback lock_page_memcg unlock_page_memcg Due to configuration limitations this deadlock is not currently possible because we don't mix cgroup writeback (a cgroupv2 feature) and memory.move_charge_at_immigrate (a cgroupv1 feature). If the kernel is hacked to always claim inode switching and memcg moving_account, then this script triggers lockup in less than a minute: cd /mnt/cgroup/memory mkdir a b echo 1 > a/memory.move_charge_at_immigrate echo 1 > b/memory.move_charge_at_immigrate ( echo $BASHPID > a/cgroup.procs while true; do dd if=/dev/zero of=/mnt/big bs=1M count=256 done ) & while true; do sync done & sleep 1h & SLEEP=$! while true; do echo $SLEEP > a/cgroup.procs echo $SLEEP > b/cgroup.procs done The deadlock does not seem possible, so it's debatable if there's any reason to modify the kernel. I suggest we should to prevent future surprises. And Wang Long said "this deadlock occurs three times in our environment", so there's more reason to apply this, even to stable. Stable 4.4 has minor conflicts applying this patch. 
For a clean 4.4 patch see "[PATCH for-4.4] writeback: safer lock nesting" https://lkml.org/lkml/2018/4/11/146 Wang Long said "this deadlock occurs three times in our environment" [gthelen@google.com: v4] Link: http://lkml.kernel.org/r/20180411084653.254724-1-gthelen@google.com [akpm@linux-foundation.org: comment tweaks, struct initialization simplification] Change-Id: Ibb773e8045852978f6207074491d262f1b3fb613 Link: http://lkml.kernel.org/r/20180410005908.167976-1-gthelen@google.com Fixes: 682aa8e1a6a1 ("writeback: implement unlocked_inode_to_wb transaction and use it for stat updates") Signed-off-by: Greg Thelen Reported-by: Wang Long Acked-by: Wang Long Acked-by: Michal Hocko Reviewed-by: Andrew Morton Cc: Johannes Weiner Cc: Tejun Heo Cc: Nicholas Piggin Cc: [v4.2+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds [natechancellor: Applied to 4.4 based on Greg's backport on lkml.org] Signed-off-by: Nathan Chancellor Signed-off-by: Greg Kroah-Hartman --- fs/fs-writeback.c | 7 ++++--- include/linux/backing-dev-defs.h | 5 +++++ include/linux/backing-dev.h | 31 +++++++++++++++++-------------- mm/page-writeback.c | 18 +++++++++--------- 4 files changed, 35 insertions(+), 26 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index ddba99214..85afd26ca 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -748,11 +748,12 @@ int inode_congested(struct inode *inode, int cong_bits) */ if (inode && inode_to_wb_is_valid(inode)) { struct bdi_writeback *wb; - bool locked, congested; + struct wb_lock_cookie lock_cookie = {}; + bool congested; - wb = unlocked_inode_to_wb_begin(inode, &locked); + wb = unlocked_inode_to_wb_begin(inode, &lock_cookie); congested = wb_congested(wb, cong_bits); - unlocked_inode_to_wb_end(inode, locked); + unlocked_inode_to_wb_end(inode, &lock_cookie); return congested; } diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 5d5b98d32..cfb03abba 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -197,6 +197,11 @@ static inline void set_bdi_congested(struct backing_dev_info *bdi, int sync) set_wb_congested(bdi->wb.congested, sync); } +struct wb_lock_cookie { + bool locked; + unsigned long flags; +}; + #ifdef CONFIG_CGROUP_WRITEBACK /** diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 090356561..38f140216 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -374,7 +374,7 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode) /** * unlocked_inode_to_wb_begin - begin unlocked inode wb access transaction * @inode: target inode - * @lockedp: temp bool output param, to be passed to the end function + * @cookie: output param, to be passed to the end function * * The caller wants to access the wb associated with @inode but isn't * holding inode->i_lock, mapping->tree_lock or wb->list_lock. This @@ -382,12 +382,12 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode) * association doesn't change until the transaction is finished with * unlocked_inode_to_wb_end(). * - * The caller must call unlocked_inode_to_wb_end() with *@lockdep - * afterwards and can't sleep during transaction. IRQ may or may not be - * disabled on return. + * The caller must call unlocked_inode_to_wb_end() with *@cookie afterwards and + * can't sleep during the transaction. IRQs may or may not be disabled on + * return. 
*/ static inline struct bdi_writeback * -unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp) +unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie) { rcu_read_lock(); @@ -395,10 +395,10 @@ unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp) * Paired with store_release in inode_switch_wb_work_fn() and * ensures that we see the new wb if we see cleared I_WB_SWITCH. */ - *lockedp = smp_load_acquire(&inode->i_state) & I_WB_SWITCH; + cookie->locked = smp_load_acquire(&inode->i_state) & I_WB_SWITCH; - if (unlikely(*lockedp)) - spin_lock_irq(&inode->i_mapping->tree_lock); + if (unlikely(cookie->locked)) + spin_lock_irqsave(&inode->i_mapping->tree_lock, cookie->flags); /* * Protected by either !I_WB_SWITCH + rcu_read_lock() or tree_lock. @@ -410,12 +410,14 @@ unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp) /** * unlocked_inode_to_wb_end - end inode wb access transaction * @inode: target inode - * @locked: *@lockedp from unlocked_inode_to_wb_begin() + * @cookie: @cookie from unlocked_inode_to_wb_begin() */ -static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked) +static inline void unlocked_inode_to_wb_end(struct inode *inode, + struct wb_lock_cookie *cookie) { - if (unlikely(locked)) - spin_unlock_irq(&inode->i_mapping->tree_lock); + if (unlikely(cookie->locked)) + spin_unlock_irqrestore(&inode->i_mapping->tree_lock, + cookie->flags); rcu_read_unlock(); } @@ -462,12 +464,13 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode) } static inline struct bdi_writeback * -unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp) +unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie) { return inode_to_wb(inode); } -static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked) +static inline void unlocked_inode_to_wb_end(struct inode *inode, + struct wb_lock_cookie *cookie) { } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index c6843c6a3..9b096e3f4 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2589,13 +2589,13 @@ void account_page_redirty(struct page *page) if (mapping && mapping_cap_account_dirty(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; - bool locked; + struct wb_lock_cookie cookie = {}; - wb = unlocked_inode_to_wb_begin(inode, &locked); + wb = unlocked_inode_to_wb_begin(inode, &cookie); current->nr_dirtied--; dec_zone_page_state(page, NR_DIRTIED); dec_wb_stat(wb, WB_DIRTIED); - unlocked_inode_to_wb_end(inode, locked); + unlocked_inode_to_wb_end(inode, &cookie); } } EXPORT_SYMBOL(account_page_redirty); @@ -2701,15 +2701,15 @@ void cancel_dirty_page(struct page *page) struct inode *inode = mapping->host; struct bdi_writeback *wb; struct mem_cgroup *memcg; - bool locked; + struct wb_lock_cookie cookie = {}; memcg = mem_cgroup_begin_page_stat(page); - wb = unlocked_inode_to_wb_begin(inode, &locked); + wb = unlocked_inode_to_wb_begin(inode, &cookie); if (TestClearPageDirty(page)) account_page_cleaned(page, mapping, memcg, wb); - unlocked_inode_to_wb_end(inode, locked); + unlocked_inode_to_wb_end(inode, &cookie); mem_cgroup_end_page_stat(memcg); } else { ClearPageDirty(page); @@ -2742,7 +2742,7 @@ int clear_page_dirty_for_io(struct page *page) struct inode *inode = mapping->host; struct bdi_writeback *wb; struct mem_cgroup *memcg; - bool locked; + struct wb_lock_cookie cookie = {}; /* * Yes, Virginia, this is indeed insane. 
@@ -2780,14 +2780,14 @@ int clear_page_dirty_for_io(struct page *page) * exclusion. */ memcg = mem_cgroup_begin_page_stat(page); - wb = unlocked_inode_to_wb_begin(inode, &locked); + wb = unlocked_inode_to_wb_begin(inode, &cookie); if (TestClearPageDirty(page)) { mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); dec_zone_page_state(page, NR_FILE_DIRTY); dec_wb_stat(wb, WB_RECLAIMABLE); ret = 1; } - unlocked_inode_to_wb_end(inode, locked); + unlocked_inode_to_wb_end(inode, &cookie); mem_cgroup_end_page_stat(memcg); return ret; } From 5dc6f8bf9b85054f923bf70b9009f2f51af7fa59 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 24 Apr 2018 09:32:12 +0200 Subject: [PATCH 096/146] Linux 4.4.129 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index d1769dd0c..5a9e77878 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 4 -SUBLEVEL = 128 +SUBLEVEL = 129 EXTRAVERSION = NAME = Blurry Fish Butt From a5918ebdbe96077695942cf1abd86d780f05c78a Mon Sep 17 00:00:00 2001 From: Dave Kleikamp Date: Thu, 15 Dec 2011 15:44:45 -0600 Subject: [PATCH 097/146] AIO: Don't plug the I/O queue in do_io_submit() Asynchronous I/O latency to a solid-state disk greatly increased between the 2.6.32 and 3.0 kernels. By removing the plug from do_io_submit(), we observed a 34% improvement in the I/O latency. Unfortunately, at this level, we don't know if the request is to a rotating disk or not. Change-Id: I7101df956473ed9fd5dcff18e473dd93b688a5c1 Signed-off-by: Dave Kleikamp Cc: linux-aio@kvack.org Cc: Chris Mason Cc: Jens Axboe Cc: Andi Kleen Cc: Jeff Moyer Signed-off-by: engstk --- fs/aio.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index 88ede4a84..f77b87a64 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1596,7 +1596,6 @@ long do_io_submit(aio_context_t ctx_id, long nr, struct kioctx *ctx; long ret = 0; int i = 0; - struct blk_plug plug; if (unlikely(nr < 0)) return -EINVAL; @@ -1613,8 +1612,6 @@ long do_io_submit(aio_context_t ctx_id, long nr, return -EINVAL; } - blk_start_plug(&plug); - /* * AKPM: should this return a partial result if some of the IOs were * successfully submitted? @@ -1637,7 +1634,6 @@ long do_io_submit(aio_context_t ctx_id, long nr, if (ret) break; } - blk_finish_plug(&plug); percpu_ref_put(&ctx->users); return i ? i : ret; From a73161ff671b520fa64473d09ff4c9d29ab4b740 Mon Sep 17 00:00:00 2001 From: myfluxi Date: Wed, 8 Jan 2014 01:25:14 +0100 Subject: [PATCH 098/146] PM: devfreq: Use high priority workqueue It does not make sense to run kgsl on high and devfreq on regular priority. 
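For background (a generic sketch, not part of the patch; the queue name and init function are made up for illustration), this is roughly what alloc_workqueue() with the flag combination used in the hunk below sets up:

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/workqueue.h>

static struct workqueue_struct *example_wq;     /* illustrative name */

static int __init example_wq_init(void)
{
        /*
         * WQ_HIGHPRI     - worker threads run at elevated (nice -20) priority
         * WQ_UNBOUND     - work items are not pinned to the submitting CPU
         * WQ_FREEZABLE   - the queue is frozen across suspend/resume
         * WQ_MEM_RECLAIM - a rescuer thread guarantees forward progress
         *                  under memory pressure
         * max_active = 0 - use the default concurrency limit
         */
        example_wq = alloc_workqueue("example_wq",
                                     WQ_HIGHPRI | WQ_UNBOUND |
                                     WQ_FREEZABLE | WQ_MEM_RECLAIM, 0);
        return example_wq ? 0 : -ENOMEM;
}

Work is then submitted with queue_work(example_wq, &some_work) exactly as with an ordinary queue; only the scheduling behaviour of the backing workers changes.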
Change-Id: Ie5e6c9353a4e1324a6a49278e5ad3638462f551c Signed-off-by: engstk --- drivers/devfreq/devfreq.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c index 21768b337..4410041c3 100644 --- a/drivers/devfreq/devfreq.c +++ b/drivers/devfreq/devfreq.c @@ -1077,7 +1077,9 @@ static int __init devfreq_init(void) return PTR_ERR(devfreq_class); } - devfreq_wq = create_freezable_workqueue("devfreq_wq"); + devfreq_wq = alloc_workqueue("devfreq_wq", + WQ_HIGHPRI | WQ_UNBOUND | WQ_FREEZABLE | + WQ_MEM_RECLAIM, 0); if (!devfreq_wq) { class_destroy(devfreq_class); pr_err("%s: couldn't create workqueue\n", __FILE__); From d3503a7af59ee547753f40d534bff62aa7d4fc58 Mon Sep 17 00:00:00 2001 From: Jeff Liu Date: Sat, 5 Jan 2013 04:53:18 +0100 Subject: [PATCH 099/146] binfmt_elf.c: use get_random_int() to fix entropy depleting Changes: -------- v4->v3: - s/random_stack_user()/get_atrandom_bytes()/ - Move this function to ahead of its use to avoid the predeclaration. v3->v2: - Tweak code comments of random_stack_user(). - Remove redundant bits mask and shift upon the random variable. v2->v1: - Fix random copy to check up buffer length that are not 4-byte multiples. v3 can be found at: http://www.spinics.net/lists/linux-fsdevel/msg59597.html v2 can be found at: http://www.spinics.net/lists/linux-fsdevel/msg59418.html v1 can be found at: http://www.spinics.net/lists/linux-fsdevel/msg59128.html Thanks, -Jeff Entropy is quickly depleted under normal operations like ls(1), cat(1), etc... between 2.6.30 to current mainline, for instance: $ cat /proc/sys/kernel/random/entropy_avail 3428 $ cat /proc/sys/kernel/random/entropy_avail 2911 $cat /proc/sys/kernel/random/entropy_avail 2620 We observed this problem has been occurring since 2.6.30 with fs/binfmt_elf.c: create_elf_tables()->get_random_bytes(), introduced by f06295b44c296c8f ("ELF: implement AT_RANDOM for glibc PRNG seeding"). /* * Generate 16 random bytes for userspace PRNG seeding. */ get_random_bytes(k_rand_bytes, sizeof(k_rand_bytes)); The patch introduces a wrapper around get_random_int() which has lower overhead than calling get_random_bytes() directly. With this patch applied: $ cat /proc/sys/kernel/random/entropy_avail 2731 $ cat /proc/sys/kernel/random/entropy_avail 2802 $ cat /proc/sys/kernel/random/entropy_avail 2878 Analyzed by John Sobecki. Signed-off-by: Jie Liu Cc: Andrew Morton Cc: Al Viro Cc: Andreas Dilger Cc: Alan Cox Cc: Arnd Bergmann Cc: John Sobecki Cc: James Morris Cc: Jakub Jelinek Cc: Ted Ts'o Cc: Greg Kroah-Hartman Cc: Kees Cook Cc: Ulrich Drepper Signed-off-by: flar2 Signed-off-by: engstk --- fs/binfmt_elf.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 8a0243efd..991acb78f 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -147,6 +147,25 @@ static int padzero(unsigned long elf_bss) #define ELF_BASE_PLATFORM NULL #endif +/* + * Use get_random_int() to implement AT_RANDOM while avoiding depletion + * of the entropy pool. 
+ */ +static void get_atrandom_bytes(unsigned char *buf, size_t nbytes) +{ + unsigned char *p = buf; + + while (nbytes) { + unsigned int random_variable; + size_t chunk = min(nbytes, sizeof(random_variable)); + + random_variable = get_random_int(); + memcpy(p, &random_variable, chunk); + p += chunk; + nbytes -= chunk; + } +} + static int create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, unsigned long load_addr, unsigned long interp_load_addr) @@ -208,7 +227,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, /* * Generate 16 random bytes for userspace PRNG seeding. */ - get_random_bytes(k_rand_bytes, sizeof(k_rand_bytes)); + get_atrandom_bytes(k_rand_bytes, sizeof(k_rand_bytes)); u_rand_bytes = (elf_addr_t __user *) STACK_ALLOC(p, sizeof(k_rand_bytes)); if (__copy_to_user(u_rand_bytes, k_rand_bytes, sizeof(k_rand_bytes))) From 0e496a2db27e90428a06b0dfb73097249bd43102 Mon Sep 17 00:00:00 2001 From: John Dias Date: Fri, 12 Aug 2016 08:36:08 -0700 Subject: [PATCH 100/146] staging: binder - Set binder_debug_mask=0 to suppress logging Excessive logging -- not present on angler -- is affecting performance, contributing to missed audio deadlines and likely other latency-dependent tasks. Bug: 30375418 Change-Id: I88b9c7fa4540ad46e564f44a0e589b5215e8487d Signed-off-by: Alex Naidis Signed-off-by: engstk --- drivers/android/binder.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 9663fcacc..8889a9933 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -166,8 +166,7 @@ enum { BINDER_DEBUG_PRIORITY_CAP = 1U << 13, BINDER_DEBUG_SPINLOCKS = 1U << 14, }; -static uint32_t binder_debug_mask = BINDER_DEBUG_USER_ERROR | - BINDER_DEBUG_FAILED_TRANSACTION | BINDER_DEBUG_DEAD_TRANSACTION; +static uint32_t binder_debug_mask = 0; module_param_named(debug_mask, binder_debug_mask, uint, S_IWUSR | S_IRUGO); static char *binder_devices_param = CONFIG_ANDROID_BINDER_DEVICES; From 4a0e7d00f7cdc9e55155951c11a7fc43de5af195 Mon Sep 17 00:00:00 2001 From: Pranav Vashi Date: Sun, 20 Mar 2016 12:23:13 +0530 Subject: [PATCH 101/146] op5: Add state notifier driver Change-Id: Iea2235dfb59846cdc8a8cdc9f37d28a38ff6beba Signed-off-by: Pranav Vashi Signed-off-by: Joe Maples # Conflicts: # drivers/soc/qcom/Kconfig # drivers/soc/qcom/Makefile --- drivers/soc/qcom/Kconfig | 4 + drivers/soc/qcom/Makefile | 2 + drivers/soc/qcom/state_notifier.c | 129 ++++++++++++++++++++++++++++++ include/linux/state_notifier.h | 20 +++++ 4 files changed, 155 insertions(+) create mode 100644 drivers/soc/qcom/state_notifier.c create mode 100644 include/linux/state_notifier.h diff --git a/drivers/soc/qcom/Kconfig b/drivers/soc/qcom/Kconfig index eec76141d..dd32ece1b 100644 --- a/drivers/soc/qcom/Kconfig +++ b/drivers/soc/qcom/Kconfig @@ -49,3 +49,7 @@ config QCOM_SMD_RPM Say M here if you want to include support for the Qualcomm RPM as a module. This will build a module called "qcom-smd-rpm". 
+ +config STATE_NOTIFIER + bool "State Notifier" + diff --git a/drivers/soc/qcom/Makefile b/drivers/soc/qcom/Makefile index 10a93d168..6dd0c6cc7 100644 --- a/drivers/soc/qcom/Makefile +++ b/drivers/soc/qcom/Makefile @@ -3,3 +3,5 @@ obj-$(CONFIG_QCOM_PM) += spm.o obj-$(CONFIG_QCOM_SMD) += smd.o obj-$(CONFIG_QCOM_SMD_RPM) += smd-rpm.o obj-$(CONFIG_QCOM_SMEM) += smem.o + +obj-$(CONFIG_STATE_NOTIFIER) += state_notifier.o diff --git a/drivers/soc/qcom/state_notifier.c b/drivers/soc/qcom/state_notifier.c new file mode 100644 index 000000000..93e83ee1c --- /dev/null +++ b/drivers/soc/qcom/state_notifier.c @@ -0,0 +1,129 @@ +/* + * State Notifier Driver + * + * Copyright (c) 2013-2016, Pranav Vashi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include +#include +#include + +#define DEFAULT_SUSPEND_DEFER_TIME 10 +#define STATE_NOTIFIER "state_notifier" + +/* + * debug = 1 will print all + */ +static unsigned int debug; +module_param_named(debug_mask, debug, uint, 0644); + +#define dprintk(msg...) \ +do { \ + if (debug) \ + pr_info(msg); \ +} while (0) + +static bool enabled; +module_param_named(enabled, enabled, bool, 0664); +static unsigned int suspend_defer_time = DEFAULT_SUSPEND_DEFER_TIME; +module_param_named(suspend_defer_time, suspend_defer_time, uint, 0664); +static struct delayed_work suspend_work; +static struct workqueue_struct *susp_wq; +struct work_struct resume_work; +bool state_suspended; +module_param_named(state_suspended, state_suspended, bool, 0444); +static bool suspend_in_progress; + +static BLOCKING_NOTIFIER_HEAD(state_notifier_list); + +/** + * state_register_client - register a client notifier + * @nb: notifier block to callback on events + */ +int state_register_client(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&state_notifier_list, nb); +} +EXPORT_SYMBOL(state_register_client); + +/** + * state_unregister_client - unregister a client notifier + * @nb: notifier block to callback on events + */ +int state_unregister_client(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&state_notifier_list, nb); +} +EXPORT_SYMBOL(state_unregister_client); + +/** + * state_notifier_call_chain - notify clients on state_events + * @val: Value passed unmodified to notifier function + * @v: pointer passed unmodified to notifier function + * + */ +int state_notifier_call_chain(unsigned long val, void *v) +{ + return blocking_notifier_call_chain(&state_notifier_list, val, v); +} +EXPORT_SYMBOL_GPL(state_notifier_call_chain); + +static void _suspend_work(struct work_struct *work) +{ + state_suspended = true; + state_notifier_call_chain(STATE_NOTIFIER_SUSPEND, NULL); + suspend_in_progress = false; + dprintk("%s: suspend completed.\n", STATE_NOTIFIER); +} + +static void _resume_work(struct work_struct *work) +{ + state_suspended = false; + state_notifier_call_chain(STATE_NOTIFIER_ACTIVE, NULL); + dprintk("%s: resume completed.\n", STATE_NOTIFIER); +} + +void state_suspend(void) +{ + dprintk("%s: suspend called.\n", STATE_NOTIFIER); + if (state_suspended || suspend_in_progress || !enabled) + return; + + suspend_in_progress = true; + + queue_delayed_work_on(0, susp_wq, &suspend_work, + msecs_to_jiffies(suspend_defer_time * 1000)); +} + +void state_resume(void) +{ + dprintk("%s: resume called.\n", STATE_NOTIFIER); + cancel_delayed_work_sync(&suspend_work); + suspend_in_progress = false; + + 
if (state_suspended) + queue_work_on(0, susp_wq, &resume_work); +} + +static int __init state_notifier_init(void) +{ + susp_wq = create_singlethread_workqueue("state_susp_wq"); + if (!susp_wq) + pr_err("State Notifier failed to allocate suspend workqueue\n"); + + INIT_DELAYED_WORK(&suspend_work, _suspend_work); + INIT_WORK(&resume_work, _resume_work); + + return 0; +} + +subsys_initcall(state_notifier_init); + +MODULE_AUTHOR("Pranav Vashi "); +MODULE_DESCRIPTION("State Notifier Driver"); +MODULE_LICENSE("GPLv2"); diff --git a/include/linux/state_notifier.h b/include/linux/state_notifier.h new file mode 100644 index 000000000..ffb4fba75 --- /dev/null +++ b/include/linux/state_notifier.h @@ -0,0 +1,20 @@ +#ifndef __LINUX_STATE_NOTIFIER_H +#define __LINUX_STATE_NOTIFIER_H + +#include + +#define STATE_NOTIFIER_ACTIVE 0x01 +#define STATE_NOTIFIER_SUSPEND 0x02 + +struct state_event { + void *data; +}; + +extern bool state_suspended; +extern void state_suspend(void); +extern void state_resume(void); +int state_register_client(struct notifier_block *nb); +int state_unregister_client(struct notifier_block *nb); +int state_notifier_call_chain(unsigned long val, void *v); + +#endif /* _LINUX_STATE_NOTIFIER_H */ From 31bf8787db226074cbad5264cd177a49ed5a9040 Mon Sep 17 00:00:00 2001 From: Joe Maples Date: Wed, 10 May 2017 20:20:23 -0500 Subject: [PATCH 102/146] state_notifier: Enable by default Signed-off-by: Joe Maples --- drivers/soc/qcom/state_notifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/soc/qcom/state_notifier.c b/drivers/soc/qcom/state_notifier.c index 93e83ee1c..ff63465f9 100644 --- a/drivers/soc/qcom/state_notifier.c +++ b/drivers/soc/qcom/state_notifier.c @@ -28,7 +28,7 @@ do { \ pr_info(msg); \ } while (0) -static bool enabled; +static bool enabled = true; module_param_named(enabled, enabled, bool, 0664); static unsigned int suspend_defer_time = DEFAULT_SUSPEND_DEFER_TIME; module_param_named(suspend_defer_time, suspend_defer_time, uint, 0664); From 5eaa785cf337f4a5820c5722f55986f56fde5997 Mon Sep 17 00:00:00 2001 From: Joe Maples Date: Wed, 10 May 2017 21:44:21 -0500 Subject: [PATCH 103/146] state_notifier: Queue work on any core Signed-off-by: Joe Maples --- drivers/soc/qcom/state_notifier.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/soc/qcom/state_notifier.c b/drivers/soc/qcom/state_notifier.c index ff63465f9..e6faf064c 100644 --- a/drivers/soc/qcom/state_notifier.c +++ b/drivers/soc/qcom/state_notifier.c @@ -2,6 +2,7 @@ * State Notifier Driver * * Copyright (c) 2013-2016, Pranav Vashi + * (c) 2017, Joe Maples * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -96,7 +97,7 @@ void state_suspend(void) suspend_in_progress = true; - queue_delayed_work_on(0, susp_wq, &suspend_work, + queue_delayed_work(susp_wq, &suspend_work, msecs_to_jiffies(suspend_defer_time * 1000)); } @@ -107,7 +108,7 @@ void state_resume(void) suspend_in_progress = false; if (state_suspended) - queue_work_on(0, susp_wq, &resume_work); + queue_work(susp_wq, &resume_work); } static int __init state_notifier_init(void) From 8a481594a887a84c65cf38dad032e96758ede665 Mon Sep 17 00:00:00 2001 From: Joe Maples Date: Wed, 10 May 2017 21:45:20 -0500 Subject: [PATCH 104/146] state_notifier: Reduce defer on suspend call to 1 second Signed-off-by: Joe Maples --- drivers/soc/qcom/state_notifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/drivers/soc/qcom/state_notifier.c b/drivers/soc/qcom/state_notifier.c index e6faf064c..c3139c15b 100644 --- a/drivers/soc/qcom/state_notifier.c +++ b/drivers/soc/qcom/state_notifier.c @@ -14,7 +14,7 @@ #include #include -#define DEFAULT_SUSPEND_DEFER_TIME 10 +#define DEFAULT_SUSPEND_DEFER_TIME 1 #define STATE_NOTIFIER "state_notifier" /* From f084f73010457124436a3cb6af01163b998e4cb0 Mon Sep 17 00:00:00 2001 From: Pranav Vashi Date: Mon, 19 Jun 2017 18:55:05 +0530 Subject: [PATCH 105/146] state_notifier: Make workqueues unbound Signed-off-by: Pranav Vashi Signed-off-by: Joe Maples --- drivers/soc/qcom/state_notifier.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/soc/qcom/state_notifier.c b/drivers/soc/qcom/state_notifier.c index c3139c15b..d975642f8 100644 --- a/drivers/soc/qcom/state_notifier.c +++ b/drivers/soc/qcom/state_notifier.c @@ -1,7 +1,7 @@ /* * State Notifier Driver * - * Copyright (c) 2013-2016, Pranav Vashi + * Copyright (c) 2013-2017, Pranav Vashi * (c) 2017, Joe Maples * * This program is free software; you can redistribute it and/or modify @@ -113,7 +113,10 @@ void state_resume(void) static int __init state_notifier_init(void) { - susp_wq = create_singlethread_workqueue("state_susp_wq"); + susp_wq = + alloc_workqueue("state_susp_wq", + WQ_HIGHPRI | WQ_UNBOUND | WQ_MEM_RECLAIM, 0); + if (!susp_wq) pr_err("State Notifier failed to allocate suspend workqueue\n"); From af6e6e1571095af416f4cffd53e7af8801fd1751 Mon Sep 17 00:00:00 2001 From: UchihaDev Date: Tue, 1 May 2018 20:51:57 +0545 Subject: [PATCH 106/146] Revert "add O3 clangs" This reverts commit 85bde49d9cba8f57d0851ad0b10478e20cab4800. --- Makefile | 4 ---- 1 file changed, 4 deletions(-) diff --git a/Makefile b/Makefile index 5a9e77878..49951b854 100644 --- a/Makefile +++ b/Makefile @@ -674,16 +674,12 @@ KBUILD_CFLAGS += $(call cc-disable-warning, int-in-bool-context) ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE KBUILD_CFLAGS += -Os else -ifeq ($(cc-name),clang) -KBUILD_CFLAGS += -O3 -else ifdef CONFIG_PROFILE_ALL_BRANCHES KBUILD_CFLAGS += -O2 else KBUILD_CFLAGS += -O2 endif endif -endif # Tell gcc to never replace conditional load with a non-conditional one KBUILD_CFLAGS += $(call cc-option,--param=allow-store-data-races=0) From f3b7882de8c51183df192cd06f5cc11e01c1ed50 Mon Sep 17 00:00:00 2001 From: UchihaDev Date: Tue, 1 May 2018 20:52:09 +0545 Subject: [PATCH 107/146] Revert "Add O3 optimization" This reverts commit 664595eb6211640d22efb1e8e60d8f57401a83e6. 
--- Makefile | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 49951b854..98641030c 100644 --- a/Makefile +++ b/Makefile @@ -304,11 +304,10 @@ CONFIG_SHELL := $(shell if [ -x "$$BASH" ]; then echo $$BASH; \ else if [ -x /bin/bash ]; then echo /bin/bash; \ else echo sh; fi ; fi) -GRAPHITE = -fgraphite -fgraphite-identity -floop-interchange -ftree-loop-distribution -floop-strip-mine -floop-block -ftree-loop-linear HOSTCC = gcc HOSTCXX = g++ -HOSTCFLAGS = -Wall -Wmissing-prototypes -Wstrict-prototypes -O3 -fomit-frame-pointer $(GRAPHITE) -HOSTCXXFLAGS = -O3 $(GRAPHITE) +HOSTCFLAGS = -Wall -Wmissing-prototypes -Wstrict-prototypes -O2 -fomit-frame-pointer -std=gnu89 +HOSTCXXFLAGS = -O2 ifeq ($(shell $(HOSTCC) -v 2>&1 | grep -c "clang version"), 1) HOSTCFLAGS += -Wno-unused-value -Wno-unused-parameter \ @@ -377,7 +376,6 @@ AFLAGS_KERNEL = CFLAGS_GCOV = -fprofile-arcs -ftest-coverage -fno-tree-loop-im CFLAGS_KCOV = -fsanitize-coverage=trace-pc -KERNELFLAGS = $(GRAPHITE) # Use USERINCLUDE when you must reference the UAPI directories only. USERINCLUDE := \ From f50fef005dcdc8622d03dfd2391b90ab4a688609 Mon Sep 17 00:00:00 2001 From: UchihaDev Date: Tue, 1 May 2018 20:53:27 +0545 Subject: [PATCH 108/146] Revert "more optimizations" This reverts commit f7cdff23af2d67377337ffdb90f7b60667416a3c. --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 98641030c..0e674168a 100644 --- a/Makefile +++ b/Makefile @@ -418,8 +418,8 @@ KBUILD_CFLAGS := -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \ -fno-strict-aliasing -fno-common \ -Werror-implicit-function-declaration \ -Wno-format-security \ - -std=gnu89 $(call cc-option,-fno-PIE) \ - -mcpu=cortex-a53 -mtune=cortex-a53 + -std=gnu89 $(call cc-option,-fno-PIE) + KBUILD_AFLAGS_KERNEL := KBUILD_CFLAGS_KERNEL := From 196018ce2a904ad696dc5546ba67ac5104b502d1 Mon Sep 17 00:00:00 2001 From: acuicultor Date: Sun, 29 Apr 2018 18:22:28 +0200 Subject: [PATCH 109/146] Makefile: add build tweaks --- Makefile | 65 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 32 insertions(+), 33 deletions(-) diff --git a/Makefile b/Makefile index 0e674168a..48c9f7116 100644 --- a/Makefile +++ b/Makefile @@ -257,7 +257,7 @@ SUBARCH := $(shell uname -m | sed -e s/i.86/x86/ -e s/x86_64/x86/ \ # "make" in the configured kernel build directory always uses that. # Default value for CROSS_COMPILE is not to prefix executables # Note: Some architectures assign CROSS_COMPILE in their arch/*/Makefile -ARCH ?= $(SUBARCH) +ARCH ?= arm64 CROSS_COMPILE ?= $(CONFIG_CROSS_COMPILE:"%"=%) # Architecture as present in compile.h @@ -349,11 +349,11 @@ scripts/Kbuild.include: ; include scripts/Kbuild.include # Make variables (CC, etc...) 
-AS = $(SOURCEANALYZER) $(CROSS_COMPILE)as -LD = $(SOURCEANALYZER) $(CROSS_COMPILE)ld -CC = $(SOURCEANALYZER) $(CCACHE) $(CROSS_COMPILE)gcc +AS = $(CROSS_COMPILE)as +LD = $(CROSS_COMPILE)ld +CC = $(CROSS_COMPILE)gcc CPP = $(CC) -E -AR = $(SOURCEANALYZER) $(CROSS_COMPILE)ar +AR = $(CROSS_COMPILE)ar NM = $(CROSS_COMPILE)nm STRIP = $(CROSS_COMPILE)strip OBJCOPY = $(CROSS_COMPILE)objcopy @@ -370,7 +370,7 @@ CHECKFLAGS := -D__linux__ -Dlinux -D__STDC__ -Dunix -D__unix__ \ -Wbitwise -Wno-return-void $(CF) -Wall CFLAGS_MODULE = AFLAGS_MODULE = -LDFLAGS_MODULE = +LDFLAGS_MODULE = --strip-debug CFLAGS_KERNEL = AFLAGS_KERNEL = CFLAGS_GCOV = -fprofile-arcs -ftest-coverage -fno-tree-loop-im @@ -420,7 +420,6 @@ KBUILD_CFLAGS := -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \ -Wno-format-security \ -std=gnu89 $(call cc-option,-fno-PIE) - KBUILD_AFLAGS_KERNEL := KBUILD_CFLAGS_KERNEL := KBUILD_AFLAGS := -D__ASSEMBLY__ $(call cc-option,-fno-PIE) @@ -698,9 +697,9 @@ KBUILD_CFLAGS += $(call cc-option,-fno-reorder-blocks,) \ $(call cc-option,-fno-partial-inlining) endif -ifneq ($(CONFIG_FRAME_WARN),0) -KBUILD_CFLAGS += $(call cc-option,-Wframe-larger-than=${CONFIG_FRAME_WARN}) -endif +#ifneq ($(CONFIG_FRAME_WARN),0) +#KBUILD_CFLAGS += $(call cc-option,-Wframe-larger-than=${CONFIG_FRAME_WARN}) +#endif # Handle stack protector mode. # @@ -767,18 +766,18 @@ KBUILD_CFLAGS += $(call cc-disable-warning, unused-but-set-variable) KBUILD_CFLAGS += $(call cc-disable-warning, unused-const-variable) endif -ifdef CONFIG_FRAME_POINTER -KBUILD_CFLAGS += -fno-omit-frame-pointer -fno-optimize-sibling-calls -else +#ifdef CONFIG_FRAME_POINTER +#KBUILD_CFLAGS += -fno-omit-frame-pointer -fno-optimize-sibling-calls +#else # Some targets (ARM with Thumb2, for example), can't be built with frame # pointers. For those, we don't have FUNCTION_TRACER automatically # select FRAME_POINTER. However, FUNCTION_TRACER adds -pg, and this is # incompatible with -fomit-frame-pointer with current GCC, so we don't use # -fomit-frame-pointer with FUNCTION_TRACER. 
-ifndef CONFIG_FUNCTION_TRACER +#ifndef CONFIG_FUNCTION_TRACER KBUILD_CFLAGS += -fomit-frame-pointer -endif -endif +#endif +#endif KBUILD_CFLAGS += $(call cc-option, -fno-var-tracking-assignments) @@ -799,23 +798,23 @@ KBUILD_CFLAGS += $(call cc-option, -femit-struct-debug-baseonly) \ $(call cc-option,-fno-var-tracking) endif -ifdef CONFIG_FUNCTION_TRACER -ifndef CC_FLAGS_FTRACE -CC_FLAGS_FTRACE := -pg -endif -export CC_FLAGS_FTRACE -ifdef CONFIG_HAVE_FENTRY -CC_USING_FENTRY := $(call cc-option, -mfentry -DCC_USING_FENTRY) -endif -KBUILD_CFLAGS += $(CC_FLAGS_FTRACE) $(CC_USING_FENTRY) -KBUILD_AFLAGS += $(CC_USING_FENTRY) -ifdef CONFIG_DYNAMIC_FTRACE - ifdef CONFIG_HAVE_C_RECORDMCOUNT - BUILD_C_RECORDMCOUNT := y - export BUILD_C_RECORDMCOUNT - endif -endif -endif +#ifdef CONFIG_FUNCTION_TRACER +#ifndef CC_FLAGS_FTRACE +#CC_FLAGS_FTRACE := -pg +#endif +#export CC_FLAGS_FTRACE +#ifdef CONFIG_HAVE_FENTRY +#CC_USING_FENTRY := $(call cc-option, -mfentry -DCC_USING_FENTRY) +#endif +#KBUILD_CFLAGS += $(CC_FLAGS_FTRACE) $(CC_USING_FENTRY) +#KBUILD_AFLAGS += $(CC_USING_FENTRY) +#ifdef CONFIG_DYNAMIC_FTRACE +# ifdef CONFIG_HAVE_C_RECORDMCOUNT +# BUILD_C_RECORDMCOUNT := y +# export BUILD_C_RECORDMCOUNT +# endif +#endif +#endif # We trigger additional mismatches with less inlining ifdef CONFIG_DEBUG_SECTION_MISMATCH From 43aeaa6613d184e649766c70c50fb8f15fecbad4 Mon Sep 17 00:00:00 2001 From: Sultanxda Date: Sun, 4 Feb 2018 02:26:33 -0800 Subject: [PATCH 110/146] net: sch_generic: Remove unnecessary watchdog warning We're never going to debug these ugly warnings. Signed-off-by: Sultanxda Signed-off-by: engstk --- net/sched/sch_generic.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index aa4725038..8ba8a611d 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -301,11 +301,8 @@ static void dev_watchdog(unsigned long arg) } } - if (some_queue_timedout) { - WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n", - dev->name, netdev_drivername(dev), i); + if (some_queue_timedout) dev->netdev_ops->ndo_tx_timeout(dev); - } if (!mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + dev->watchdog_timeo))) From 2849c956262860c5c34f9d839665466730ab1b6e Mon Sep 17 00:00:00 2001 From: savoca Date: Fri, 27 Mar 2015 06:12:15 +0000 Subject: [PATCH 111/146] staging: Add frandom RNG driver - In staging until checkpatch.pl errors are cleared - Full credit to Eli Billauer: http://www.billauer.co.il/frandom.html urandom: 1048576 bytes transferred in 0.280 secs (3744914 bytes/sec) 10485760 bytes transferred in 2.521 secs (4159365 bytes/sec) 104857600 bytes transferred in 24.855 secs (4218772 bytes/sec) frandom: 1048576 bytes transferred in 0.030 secs (34952533 bytes/sec) 10485760 bytes transferred in 0.224 secs (46811428 bytes/sec) 104857600 bytes transferred in 1.921 secs (54584903 bytes/sec) erandom: 1048576 bytes transferred in 0.026 secs (40329846 bytes/sec) 10485760 bytes transferred in 0.221 secs (47446877 bytes/sec) 104857600 bytes transferred in 1.911 secs (54870538 bytes/sec) Signed-off-by: Ryan Andri --- drivers/staging/Kconfig | 2 + drivers/staging/Makefile | 2 + drivers/staging/frandom/Kconfig | 14 + drivers/staging/frandom/Makefile | 1 + drivers/staging/frandom/frandom.c | 415 ++++++++++++++++++++++++++++++ 5 files changed, 434 insertions(+) create mode 100644 drivers/staging/frandom/Kconfig create mode 100644 drivers/staging/frandom/Makefile create mode 100644 
drivers/staging/frandom/frandom.c diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig index 5d3b86a33..05b16f375 100644 --- a/drivers/staging/Kconfig +++ b/drivers/staging/Kconfig @@ -110,4 +110,6 @@ source "drivers/staging/wilc1000/Kconfig" source "drivers/staging/most/Kconfig" +source "drivers/staging/frandom/Kconfig" + endif # STAGING diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile index 30918edef..7efef9b7c 100644 --- a/drivers/staging/Makefile +++ b/drivers/staging/Makefile @@ -47,3 +47,5 @@ obj-$(CONFIG_FB_TFT) += fbtft/ obj-$(CONFIG_FSL_MC_BUS) += fsl-mc/ obj-$(CONFIG_WILC1000) += wilc1000/ obj-$(CONFIG_MOST) += most/ + +obj-$(CONFIG_FRANDOM) += frandom/ diff --git a/drivers/staging/frandom/Kconfig b/drivers/staging/frandom/Kconfig new file mode 100644 index 000000000..57529dded --- /dev/null +++ b/drivers/staging/frandom/Kconfig @@ -0,0 +1,14 @@ +config FRANDOM + tristate "Frandom RNG driver" + help + Frandom is a Linux kernel random number generator, which + is 10-50 times faster than what you get from Linux' built-in + /dev/urandom. And it uses very little (/dev/frandom) or none + (/dev/erandom) of the kernel's entropy pool, so it is very + useful for applications that require a handy source for lots + of random data. + + http://www.billauer.co.il/frandom.html + + If unsure here, select N. + diff --git a/drivers/staging/frandom/Makefile b/drivers/staging/frandom/Makefile new file mode 100644 index 000000000..5a4ff4ef9 --- /dev/null +++ b/drivers/staging/frandom/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_FRANDOM) += frandom.o diff --git a/drivers/staging/frandom/frandom.c b/drivers/staging/frandom/frandom.c new file mode 100644 index 000000000..54b98c53e --- /dev/null +++ b/drivers/staging/frandom/frandom.c @@ -0,0 +1,415 @@ +/* +** frandom.c +** Fast pseudo-random generator +** +** (c) Copyright 2003-2011 Eli Billauer +** http://www.billauer.co.il +** +** This program is free software; you can redistribute it and/or modify +** it under the terms of the GNU General Public License as published by +** the Free Software Foundation; either version 2 of the License, or +** (at your option) any later version. 
+** +** +*/ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define INTERNAL_SEED 0 +#define EXTERNAL_SEED 1 + +#define FRANDOM_MAJOR 235 +#define FRANDOM_MINOR 11 +#define ERANDOM_MINOR 12 + +static struct file_operations frandom_fops; /* Values assigned below */ + +static int erandom_seeded = 0; /* Internal flag */ + +static int frandom_major = FRANDOM_MAJOR; +static int frandom_minor = FRANDOM_MINOR; +static int erandom_minor = ERANDOM_MINOR; +static int frandom_bufsize = 256; +static int frandom_chunklimit = 0; /* =0 means unlimited */ + +static struct cdev frandom_cdev; +static struct cdev erandom_cdev; +static struct class *frandom_class; +struct device *frandom_device; +struct device *erandom_device; + +MODULE_DESCRIPTION("Fast pseudo-random number generator"); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Eli Billauer"); +module_param(frandom_major, int, 0); +module_param(frandom_minor, int, 0); +module_param(erandom_minor, int, 0); +module_param(frandom_bufsize, int, 0); +module_param(frandom_chunklimit, int, 0); + +MODULE_PARM_DESC(frandom_major,"Major number of /dev/frandom and /dev/erandom"); +MODULE_PARM_DESC(frandom_minor,"Minor number of /dev/frandom"); +MODULE_PARM_DESC(erandom_minor,"Minor number of /dev/erandom"); +MODULE_PARM_DESC(frandom_bufsize,"Internal buffer size in bytes. Default is 256. Must be >= 256"); +MODULE_PARM_DESC(frandom_chunklimit,"Limit for read() blocks size. 0 (default) is unlimited, otherwise must be >= 256"); + +struct frandom_state +{ + struct semaphore sem; /* Semaphore on the state structure */ + + u8 S[256]; /* The state array */ + u8 i; + u8 j; + + char *buf; +}; + +static struct frandom_state *erandom_state; + +static inline void swap_byte(u8 *a, u8 *b) +{ + u8 swapByte; + + swapByte = *a; + *a = *b; + *b = swapByte; +} + +static void init_rand_state(struct frandom_state *state, int seedflag); + +void erandom_get_random_bytes(char *buf, size_t count) +{ + struct frandom_state *state = erandom_state; + int k; + + unsigned int i; + unsigned int j; + u8 *S; + + /* If we fail to get the semaphore, we revert to external random data. + Since semaphore blocking is expected to be very rare, and interrupts + during these rare and very short periods of time even less frequent, + we take the better-safe-than-sorry approach, and fill the buffer + some expensive random data, in case the caller wasn't aware of this + possibility, and expects random data anyhow. + */ + + if (down_interruptible(&state->sem)) { + get_random_bytes(buf, count); + return; + } + + /* We seed erandom as late as possible, hoping that the kernel's main + RNG is already restored in the boot sequence (not critical, but + better. 
+ */ + + if (!erandom_seeded) { + erandom_seeded = 1; + init_rand_state(state, EXTERNAL_SEED); + printk(KERN_INFO "frandom: Seeded global generator now (used by erandom)\n"); + } + + i = state->i; + j = state->j; + S = state->S; + + for (k=0; ki = i; + state->j = j; + + up(&state->sem); +} + +static void init_rand_state(struct frandom_state *state, int seedflag) +{ + unsigned int i, j, k; + u8 *S; + u8 *seed = state->buf; + + if (seedflag == INTERNAL_SEED) + erandom_get_random_bytes(seed, 256); + else + get_random_bytes(seed, 256); + + S = state->S; + for (i=0; i<256; i++) + *S++=i; + + j=0; + S = state->S; + + for (i=0; i<256; i++) { + j = (j + S[i] + *seed++) & 0xff; + swap_byte(&S[i], &S[j]); + } + + /* It's considered good practice to discard the first 256 bytes + generated. So we do it: + */ + + i=0; j=0; + for (k=0; k<256; k++) { + i = (i + 1) & 0xff; + j = (j + S[i]) & 0xff; + swap_byte(&S[i], &S[j]); + } + + state->i = i; /* Save state */ + state->j = j; +} + +static int frandom_open(struct inode *inode, struct file *filp) +{ + + struct frandom_state *state; + + int num = iminor(inode); + + /* This should never happen, now when the minors are regsitered + * explicitly + */ + if ((num != frandom_minor) && (num != erandom_minor)) return -ENODEV; + + state = kmalloc(sizeof(struct frandom_state), GFP_KERNEL); + if (!state) + return -ENOMEM; + + state->buf = kmalloc(frandom_bufsize, GFP_KERNEL); + if (!state->buf) { + kfree(state); + return -ENOMEM; + } + + sema_init(&state->sem, 1); /* Init semaphore as a mutex */ + + if (num == frandom_minor) + init_rand_state(state, EXTERNAL_SEED); + else + init_rand_state(state, INTERNAL_SEED); + + filp->private_data = state; + + return 0; /* Success */ +} + +static int frandom_release(struct inode *inode, struct file *filp) +{ + + struct frandom_state *state = filp->private_data; + + kfree(state->buf); + kfree(state); + + return 0; +} + +static ssize_t frandom_read(struct file *filp, char *buf, size_t count, + loff_t *f_pos) +{ + struct frandom_state *state = filp->private_data; + ssize_t ret; + int dobytes, k; + char *localbuf; + + unsigned int i; + unsigned int j; + u8 *S; + + if (down_interruptible(&state->sem)) + return -ERESTARTSYS; + + if ((frandom_chunklimit > 0) && (count > frandom_chunklimit)) + count = frandom_chunklimit; + + ret = count; /* It's either everything or an error... */ + + i = state->i; + j = state->j; + S = state->S; + + while (count) { + if (count > frandom_bufsize) + dobytes = frandom_bufsize; + else + dobytes = count; + + localbuf = state->buf; + + for (k=0; kbuf, dobytes)) { + ret = -EFAULT; + goto out; + } + + buf += dobytes; + count -= dobytes; + } + + out: + state->i = i; + state->j = j; + + up(&state->sem); + return ret; +} + +static struct file_operations frandom_fops = { + read: frandom_read, + open: frandom_open, + release: frandom_release, +}; + +static void frandom_cleanup_module(void) { + unregister_chrdev_region(MKDEV(frandom_major, erandom_minor), 1); + cdev_del(&erandom_cdev); + device_destroy(frandom_class, MKDEV(frandom_major, erandom_minor)); + + unregister_chrdev_region(MKDEV(frandom_major, frandom_minor), 1); + cdev_del(&frandom_cdev); + device_destroy(frandom_class, MKDEV(frandom_major, frandom_minor)); + class_destroy(frandom_class); + + kfree(erandom_state->buf); + kfree(erandom_state); +} + + +static int frandom_init_module(void) +{ + int result; + + /* The buffer size MUST be at least 256 bytes, because we assume that + minimal length in init_rand_state(). 
+ */ + if (frandom_bufsize < 256) { + printk(KERN_ERR "frandom: Refused to load because frandom_bufsize=%d < 256\n",frandom_bufsize); + return -EINVAL; + } + if ((frandom_chunklimit != 0) && (frandom_chunklimit < 256)) { + printk(KERN_ERR "frandom: Refused to load because frandom_chunklimit=%d < 256 and != 0\n",frandom_chunklimit); + return -EINVAL; + } + + erandom_state = kmalloc(sizeof(struct frandom_state), GFP_KERNEL); + if (!erandom_state) + return -ENOMEM; + + /* This specific buffer is only used for seeding, so we need + 256 bytes exactly */ + erandom_state->buf = kmalloc(256, GFP_KERNEL); + if (!erandom_state->buf) { + kfree(erandom_state); + return -ENOMEM; + } + + sema_init(&erandom_state->sem, 1); /* Init semaphore as a mutex */ + + erandom_seeded = 0; + + frandom_class = class_create(THIS_MODULE, "fastrng"); + if (IS_ERR(frandom_class)) { + result = PTR_ERR(frandom_class); + printk(KERN_WARNING "frandom: Failed to register class fastrng\n"); + goto error0; + } + + /* + * Register your major, and accept a dynamic number. This is the + * first thing to do, in order to avoid releasing other module's + * fops in frandom_cleanup_module() + */ + + cdev_init(&frandom_cdev, &frandom_fops); + frandom_cdev.owner = THIS_MODULE; + result = cdev_add(&frandom_cdev, MKDEV(frandom_major, frandom_minor), 1); + if (result) { + printk(KERN_WARNING "frandom: Failed to add cdev for /dev/frandom\n"); + goto error1; + } + + result = register_chrdev_region(MKDEV(frandom_major, frandom_minor), 1, "/dev/frandom"); + if (result < 0) { + printk(KERN_WARNING "frandom: can't get major/minor %d/%d\n", frandom_major, frandom_minor); + goto error2; + } + + frandom_device = device_create(frandom_class, NULL, MKDEV(frandom_major, frandom_minor), NULL, "frandom"); + + if (IS_ERR(frandom_device)) { + printk(KERN_WARNING "frandom: Failed to create frandom device\n"); + goto error3; + } + + cdev_init(&erandom_cdev, &frandom_fops); + erandom_cdev.owner = THIS_MODULE; + result = cdev_add(&erandom_cdev, MKDEV(frandom_major, erandom_minor), 1); + if (result) { + printk(KERN_WARNING "frandom: Failed to add cdev for /dev/erandom\n"); + goto error4; + } + + result = register_chrdev_region(MKDEV(frandom_major, erandom_minor), 1, "/dev/erandom"); + if (result < 0) { + printk(KERN_WARNING "frandom: can't get major/minor %d/%d\n", frandom_major, erandom_minor); + goto error5; + } + + erandom_device = device_create(frandom_class, NULL, MKDEV(frandom_major, erandom_minor), NULL, "erandom"); + + if (IS_ERR(erandom_device)) { + printk(KERN_WARNING "frandom: Failed to create erandom device\n"); + goto error6; + } + return 0; /* succeed */ + + error6: + unregister_chrdev_region(MKDEV(frandom_major, erandom_minor), 1); + error5: + cdev_del(&erandom_cdev); + error4: + device_destroy(frandom_class, MKDEV(frandom_major, frandom_minor)); + error3: + unregister_chrdev_region(MKDEV(frandom_major, frandom_minor), 1); + error2: + cdev_del(&frandom_cdev); + error1: + class_destroy(frandom_class); + error0: + kfree(erandom_state->buf); + kfree(erandom_state); + + return result; +} + +module_init(frandom_init_module); +module_exit(frandom_cleanup_module); + +EXPORT_SYMBOL(erandom_get_random_bytes); From bad1137a8c449db912480a7e06eeeb59e04bf246 Mon Sep 17 00:00:00 2001 From: savoca Date: Sun, 29 Mar 2015 08:22:30 -0400 Subject: [PATCH 112/146] staging: frandom: Dynamically allocate the char device numbers - This will fix issues where devices weren't working 'out of the box' Signed-off-by: Ryan Andri --- drivers/staging/frandom/frandom.c | 80 
+++++++++++++------------------ 1 file changed, 33 insertions(+), 47 deletions(-) diff --git a/drivers/staging/frandom/frandom.c b/drivers/staging/frandom/frandom.c index 54b98c53e..e95809052 100644 --- a/drivers/staging/frandom/frandom.c +++ b/drivers/staging/frandom/frandom.c @@ -33,17 +33,16 @@ #define INTERNAL_SEED 0 #define EXTERNAL_SEED 1 -#define FRANDOM_MAJOR 235 -#define FRANDOM_MINOR 11 -#define ERANDOM_MINOR 12 +#define NR_FRANDOM_DEVS 2 static struct file_operations frandom_fops; /* Values assigned below */ static int erandom_seeded = 0; /* Internal flag */ -static int frandom_major = FRANDOM_MAJOR; -static int frandom_minor = FRANDOM_MINOR; -static int erandom_minor = ERANDOM_MINOR; +static dev_t frandom_devt; +static dev_t erandom_devt; +static int frandom_minor; +static int erandom_minor; static int frandom_bufsize = 256; static int frandom_chunklimit = 0; /* =0 means unlimited */ @@ -56,15 +55,9 @@ struct device *erandom_device; MODULE_DESCRIPTION("Fast pseudo-random number generator"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Eli Billauer"); -module_param(frandom_major, int, 0); -module_param(frandom_minor, int, 0); -module_param(erandom_minor, int, 0); module_param(frandom_bufsize, int, 0); module_param(frandom_chunklimit, int, 0); -MODULE_PARM_DESC(frandom_major,"Major number of /dev/frandom and /dev/erandom"); -MODULE_PARM_DESC(frandom_minor,"Minor number of /dev/frandom"); -MODULE_PARM_DESC(erandom_minor,"Minor number of /dev/erandom"); MODULE_PARM_DESC(frandom_bufsize,"Internal buffer size in bytes. Default is 256. Must be >= 256"); MODULE_PARM_DESC(frandom_chunklimit,"Limit for read() blocks size. 0 (default) is unlimited, otherwise must be >= 256"); @@ -188,7 +181,7 @@ static int frandom_open(struct inode *inode, struct file *filp) int num = iminor(inode); /* This should never happen, now when the minors are regsitered - * explicitly + * explicitly (or dynamically) */ if ((num != frandom_minor) && (num != erandom_minor)) return -ENODEV; @@ -288,14 +281,11 @@ static struct file_operations frandom_fops = { }; static void frandom_cleanup_module(void) { - unregister_chrdev_region(MKDEV(frandom_major, erandom_minor), 1); + device_destroy(frandom_class, erandom_devt); cdev_del(&erandom_cdev); - device_destroy(frandom_class, MKDEV(frandom_major, erandom_minor)); - - unregister_chrdev_region(MKDEV(frandom_major, frandom_minor), 1); + device_destroy(frandom_class, frandom_devt); cdev_del(&frandom_cdev); - device_destroy(frandom_class, MKDEV(frandom_major, frandom_minor)); - class_destroy(frandom_class); + unregister_chrdev_region(frandom_devt, NR_FRANDOM_DEVS); kfree(erandom_state->buf); kfree(erandom_state); @@ -347,21 +337,25 @@ static int frandom_init_module(void) * fops in frandom_cleanup_module() */ + result = alloc_chrdev_region(&frandom_devt, 0, NR_FRANDOM_DEVS, "frandom"); + if (result < 0) { + printk(KERN_WARNING "frandom: failed to alloc frandom region\n"); + goto error1; + } + + frandom_minor = MINOR(frandom_devt); + erandom_minor = frandom_minor + 1; + erandom_devt = MKDEV(MAJOR(frandom_devt), erandom_minor); + cdev_init(&frandom_cdev, &frandom_fops); frandom_cdev.owner = THIS_MODULE; - result = cdev_add(&frandom_cdev, MKDEV(frandom_major, frandom_minor), 1); + result = cdev_add(&frandom_cdev, frandom_devt, 1); if (result) { printk(KERN_WARNING "frandom: Failed to add cdev for /dev/frandom\n"); - goto error1; - } - - result = register_chrdev_region(MKDEV(frandom_major, frandom_minor), 1, "/dev/frandom"); - if (result < 0) { - printk(KERN_WARNING "frandom: can't get 
major/minor %d/%d\n", frandom_major, frandom_minor); goto error2; } - frandom_device = device_create(frandom_class, NULL, MKDEV(frandom_major, frandom_minor), NULL, "frandom"); + frandom_device = device_create(frandom_class, NULL, frandom_devt, NULL, "frandom"); if (IS_ERR(frandom_device)) { printk(KERN_WARNING "frandom: Failed to create frandom device\n"); @@ -370,43 +364,35 @@ static int frandom_init_module(void) cdev_init(&erandom_cdev, &frandom_fops); erandom_cdev.owner = THIS_MODULE; - result = cdev_add(&erandom_cdev, MKDEV(frandom_major, erandom_minor), 1); + result = cdev_add(&erandom_cdev, erandom_devt, 1); if (result) { printk(KERN_WARNING "frandom: Failed to add cdev for /dev/erandom\n"); goto error4; } - result = register_chrdev_region(MKDEV(frandom_major, erandom_minor), 1, "/dev/erandom"); - if (result < 0) { - printk(KERN_WARNING "frandom: can't get major/minor %d/%d\n", frandom_major, erandom_minor); - goto error5; - } - - erandom_device = device_create(frandom_class, NULL, MKDEV(frandom_major, erandom_minor), NULL, "erandom"); + erandom_device = device_create(frandom_class, NULL, erandom_devt, NULL, "erandom"); if (IS_ERR(erandom_device)) { printk(KERN_WARNING "frandom: Failed to create erandom device\n"); - goto error6; + goto error5; } return 0; /* succeed */ - error6: - unregister_chrdev_region(MKDEV(frandom_major, erandom_minor), 1); - error5: +error5: cdev_del(&erandom_cdev); - error4: - device_destroy(frandom_class, MKDEV(frandom_major, frandom_minor)); - error3: - unregister_chrdev_region(MKDEV(frandom_major, frandom_minor), 1); - error2: +error4: + device_destroy(frandom_class, frandom_devt); +error3: cdev_del(&frandom_cdev); - error1: +error2: + unregister_chrdev_region(frandom_devt, NR_FRANDOM_DEVS); +error1: class_destroy(frandom_class); - error0: +error0: kfree(erandom_state->buf); kfree(erandom_state); - return result; + return result; } module_init(frandom_init_module); From 5753ff5d3d42819008f2b6a37c168e5ceff601d4 Mon Sep 17 00:00:00 2001 From: savoca Date: Sun, 29 Mar 2015 14:28:34 +0000 Subject: [PATCH 113/146] staging: frandom: Clear up checkpatch conflicts Signed-off-by: Ryan Andri --- drivers/staging/frandom/frandom.c | 142 ++++++++++++++++-------------- 1 file changed, 75 insertions(+), 67 deletions(-) diff --git a/drivers/staging/frandom/frandom.c b/drivers/staging/frandom/frandom.c index e95809052..c6add1eaf 100644 --- a/drivers/staging/frandom/frandom.c +++ b/drivers/staging/frandom/frandom.c @@ -1,6 +1,6 @@ /* ** frandom.c -** Fast pseudo-random generator +** Fast pseudo-random generator ** ** (c) Copyright 2003-2011 Eli Billauer ** http://www.billauer.co.il @@ -19,13 +19,13 @@ #include #include -#include -#include +#include +#include #include -#include +#include #include -#include +#include #include #include #include @@ -35,16 +35,16 @@ #define NR_FRANDOM_DEVS 2 -static struct file_operations frandom_fops; /* Values assigned below */ +static const struct file_operations frandom_fops; /* Values assigned below */ -static int erandom_seeded = 0; /* Internal flag */ +static int erandom_seeded; /* Internal flag */ static dev_t frandom_devt; static dev_t erandom_devt; static int frandom_minor; static int erandom_minor; static int frandom_bufsize = 256; -static int frandom_chunklimit = 0; /* =0 means unlimited */ +static int frandom_chunklimit; /* =0 means unlimited */ static struct cdev frandom_cdev; static struct cdev erandom_cdev; @@ -58,15 +58,17 @@ MODULE_AUTHOR("Eli Billauer"); module_param(frandom_bufsize, int, 0); module_param(frandom_chunklimit, 
int, 0); -MODULE_PARM_DESC(frandom_bufsize,"Internal buffer size in bytes. Default is 256. Must be >= 256"); -MODULE_PARM_DESC(frandom_chunklimit,"Limit for read() blocks size. 0 (default) is unlimited, otherwise must be >= 256"); +MODULE_PARM_DESC(frandom_bufsize, + "Internal buffer size in bytes. Default is 256. Must be >= 256"); +MODULE_PARM_DESC(frandom_chunklimit, + "Limit for read() blocks size. 0 (default) is unlimited," + "otherwise must be >= 256"); -struct frandom_state -{ +struct frandom_state { struct semaphore sem; /* Semaphore on the state structure */ u8 S[256]; /* The state array */ - u8 i; + u8 i; u8 j; char *buf; @@ -76,10 +78,10 @@ static struct frandom_state *erandom_state; static inline void swap_byte(u8 *a, u8 *b) { - u8 swapByte; - - swapByte = *a; - *a = *b; + u8 swapByte; + + swapByte = *a; + *a = *b; *b = swapByte; } @@ -93,7 +95,7 @@ void erandom_get_random_bytes(char *buf, size_t count) unsigned int i; unsigned int j; u8 *S; - + /* If we fail to get the semaphore, we revert to external random data. Since semaphore blocking is expected to be very rare, and interrupts during these rare and very short periods of time even less frequent, @@ -111,29 +113,30 @@ void erandom_get_random_bytes(char *buf, size_t count) RNG is already restored in the boot sequence (not critical, but better. */ - + if (!erandom_seeded) { erandom_seeded = 1; init_rand_state(state, EXTERNAL_SEED); - printk(KERN_INFO "frandom: Seeded global generator now (used by erandom)\n"); + pr_info("frandom: Seeded global generator now (used by erandom)\n"); } - i = state->i; + i = state->i; j = state->j; - S = state->S; + S = state->S; - for (k=0; ki = i; + + state->i = i; state->j = j; up(&state->sem); } +EXPORT_SYMBOL(erandom_get_random_bytes); static void init_rand_state(struct frandom_state *state, int seedflag) { @@ -147,13 +150,13 @@ static void init_rand_state(struct frandom_state *state, int seedflag) get_random_bytes(seed, 256); S = state->S; - for (i=0; i<256; i++) - *S++=i; + for (i = 0; i < 256; i++) + *S++ = i; - j=0; + j = 0; S = state->S; - for (i=0; i<256; i++) { + for (i = 0; i < 256; i++) { j = (j + S[i] + *seed++) & 0xff; swap_byte(&S[i], &S[j]); } @@ -162,8 +165,8 @@ static void init_rand_state(struct frandom_state *state, int seedflag) generated. 
So we do it: */ - i=0; j=0; - for (k=0; k<256; k++) { + i = 0; j = 0; + for (k = 0; k < 256; k++) { i = (i + 1) & 0xff; j = (j + S[i]) & 0xff; swap_byte(&S[i], &S[j]); @@ -175,7 +178,7 @@ static void init_rand_state(struct frandom_state *state, int seedflag) static int frandom_open(struct inode *inode, struct file *filp) { - + struct frandom_state *state; int num = iminor(inode); @@ -183,8 +186,9 @@ static int frandom_open(struct inode *inode, struct file *filp) /* This should never happen, now when the minors are regsitered * explicitly (or dynamically) */ - if ((num != frandom_minor) && (num != erandom_minor)) return -ENODEV; - + if ((num != frandom_minor) && (num != erandom_minor)) + return -ENODEV; + state = kmalloc(sizeof(struct frandom_state), GFP_KERNEL); if (!state) return -ENOMEM; @@ -214,7 +218,7 @@ static int frandom_release(struct inode *inode, struct file *filp) kfree(state->buf); kfree(state); - + return 0; } @@ -229,18 +233,18 @@ static ssize_t frandom_read(struct file *filp, char *buf, size_t count, unsigned int i; unsigned int j; u8 *S; - + if (down_interruptible(&state->sem)) return -ERESTARTSYS; - + if ((frandom_chunklimit > 0) && (count > frandom_chunklimit)) count = frandom_chunklimit; ret = count; /* It's either everything or an error... */ - - i = state->i; + + i = state->i; j = state->j; - S = state->S; + S = state->S; while (count) { if (count > frandom_bufsize) @@ -250,13 +254,13 @@ static ssize_t frandom_read(struct file *filp, char *buf, size_t count, localbuf = state->buf; - for (k=0; kbuf, dobytes)) { ret = -EFAULT; goto out; @@ -267,20 +271,21 @@ static ssize_t frandom_read(struct file *filp, char *buf, size_t count, } out: - state->i = i; + state->i = i; state->j = j; up(&state->sem); return ret; } -static struct file_operations frandom_fops = { - read: frandom_read, - open: frandom_open, - release: frandom_release, +static const struct file_operations frandom_fops = { + .read = frandom_read, + .open = frandom_open, + .release = frandom_release, }; -static void frandom_cleanup_module(void) { +static void frandom_cleanup_module(void) +{ device_destroy(frandom_class, erandom_devt); cdev_del(&erandom_cdev); device_destroy(frandom_class, frandom_devt); @@ -298,13 +303,15 @@ static int frandom_init_module(void) /* The buffer size MUST be at least 256 bytes, because we assume that minimal length in init_rand_state(). - */ + */ if (frandom_bufsize < 256) { - printk(KERN_ERR "frandom: Refused to load because frandom_bufsize=%d < 256\n",frandom_bufsize); + pr_err("frandom: Invalid frandom_bufsize: %d\n", + frandom_bufsize); return -EINVAL; } if ((frandom_chunklimit != 0) && (frandom_chunklimit < 256)) { - printk(KERN_ERR "frandom: Refused to load because frandom_chunklimit=%d < 256 and != 0\n",frandom_chunklimit); + pr_err("frandom: Invalid frandom_chunklimit: %d\n", + frandom_chunklimit); return -EINVAL; } @@ -327,20 +334,21 @@ static int frandom_init_module(void) frandom_class = class_create(THIS_MODULE, "fastrng"); if (IS_ERR(frandom_class)) { result = PTR_ERR(frandom_class); - printk(KERN_WARNING "frandom: Failed to register class fastrng\n"); + pr_warn("frandom: Failed to register class fastrng\n"); goto error0; } - + /* * Register your major, and accept a dynamic number. 
This is the * first thing to do, in order to avoid releasing other module's * fops in frandom_cleanup_module() */ - result = alloc_chrdev_region(&frandom_devt, 0, NR_FRANDOM_DEVS, "frandom"); + result = alloc_chrdev_region(&frandom_devt, 0, NR_FRANDOM_DEVS, + "frandom"); if (result < 0) { - printk(KERN_WARNING "frandom: failed to alloc frandom region\n"); - goto error1; + pr_warn("frandom: failed to alloc frandom region\n"); + goto error1; } frandom_minor = MINOR(frandom_devt); @@ -351,14 +359,15 @@ static int frandom_init_module(void) frandom_cdev.owner = THIS_MODULE; result = cdev_add(&frandom_cdev, frandom_devt, 1); if (result) { - printk(KERN_WARNING "frandom: Failed to add cdev for /dev/frandom\n"); - goto error2; + pr_warn("frandom: Failed to add cdev for /dev/frandom\n"); + goto error2; } - frandom_device = device_create(frandom_class, NULL, frandom_devt, NULL, "frandom"); + frandom_device = device_create(frandom_class, NULL, frandom_devt, + NULL, "frandom"); if (IS_ERR(frandom_device)) { - printk(KERN_WARNING "frandom: Failed to create frandom device\n"); + pr_warn("frandom: Failed to create frandom device\n"); goto error3; } @@ -366,14 +375,15 @@ static int frandom_init_module(void) erandom_cdev.owner = THIS_MODULE; result = cdev_add(&erandom_cdev, erandom_devt, 1); if (result) { - printk(KERN_WARNING "frandom: Failed to add cdev for /dev/erandom\n"); - goto error4; + pr_warn("frandom: Failed to add cdev for /dev/erandom\n"); + goto error4; } - erandom_device = device_create(frandom_class, NULL, erandom_devt, NULL, "erandom"); + erandom_device = device_create(frandom_class, NULL, erandom_devt, + NULL, "erandom"); if (IS_ERR(erandom_device)) { - printk(KERN_WARNING "frandom: Failed to create erandom device\n"); + pr_warn("frandom: Failed to create erandom device\n"); goto error5; } return 0; /* succeed */ @@ -392,10 +402,8 @@ static int frandom_init_module(void) kfree(erandom_state->buf); kfree(erandom_state); - return result; + return result; } module_init(frandom_init_module); module_exit(frandom_cleanup_module); - -EXPORT_SYMBOL(erandom_get_random_bytes); From 5a713edabb86e09dc9006427d6528228ac147221 Mon Sep 17 00:00:00 2001 From: savoca Date: Thu, 2 Apr 2015 16:36:14 +0000 Subject: [PATCH 114/146] drivers: Upgrade frandom from staging Signed-off-by: Ryan Andri --- drivers/char/Kconfig | 14 + drivers/char/frandom.c | 215 ++++++++-------- drivers/staging/Kconfig | 2 - drivers/staging/Makefile | 1 - drivers/staging/frandom/Kconfig | 14 - drivers/staging/frandom/Makefile | 1 - drivers/staging/frandom/frandom.c | 409 ------------------------------ 7 files changed, 116 insertions(+), 540 deletions(-) delete mode 100644 drivers/staging/frandom/Kconfig delete mode 100644 drivers/staging/frandom/Makefile delete mode 100644 drivers/staging/frandom/frandom.c diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig index 22b61fdb0..3e910e224 100644 --- a/drivers/char/Kconfig +++ b/drivers/char/Kconfig @@ -614,5 +614,19 @@ config TILE_SROM source "drivers/char/xillybus/Kconfig" +config FRANDOM + tristate "Frandom RNG driver" + help + Frandom is a Linux kernel random number generator, which + is 10-50 times faster than what you get from Linux' built-in + /dev/urandom. And it uses very little (/dev/frandom) or none + (/dev/erandom) of the kernel's entropy pool, so it is very + useful for applications that require a handy source for lots + of random data. + + http://www.billauer.co.il/frandom.html + + If unsure here, select N. 
+ endmenu diff --git a/drivers/char/frandom.c b/drivers/char/frandom.c index b824f66bf..c6add1eaf 100644 --- a/drivers/char/frandom.c +++ b/drivers/char/frandom.c @@ -1,6 +1,6 @@ /* ** frandom.c -** Fast pseudo-random generator +** Fast pseudo-random generator ** ** (c) Copyright 2003-2011 Eli Billauer ** http://www.billauer.co.il @@ -19,13 +19,13 @@ #include #include -#include -#include +#include +#include #include -#include +#include #include -#include +#include #include #include #include @@ -33,19 +33,18 @@ #define INTERNAL_SEED 0 #define EXTERNAL_SEED 1 -#define FRANDOM_MAJOR 235 -#define FRANDOM_MINOR 11 -#define ERANDOM_MINOR 12 +#define NR_FRANDOM_DEVS 2 -static struct file_operations frandom_fops; /* Values assigned below */ +static const struct file_operations frandom_fops; /* Values assigned below */ -static int erandom_seeded = 0; /* Internal flag */ +static int erandom_seeded; /* Internal flag */ -static int frandom_major = FRANDOM_MAJOR; -static int frandom_minor = FRANDOM_MINOR; -static int erandom_minor = ERANDOM_MINOR; +static dev_t frandom_devt; +static dev_t erandom_devt; +static int frandom_minor; +static int erandom_minor; static int frandom_bufsize = 256; -static int frandom_chunklimit = 0; /* =0 means unlimited */ +static int frandom_chunklimit; /* =0 means unlimited */ static struct cdev frandom_cdev; static struct cdev erandom_cdev; @@ -56,24 +55,20 @@ struct device *erandom_device; MODULE_DESCRIPTION("Fast pseudo-random number generator"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Eli Billauer"); -module_param(frandom_major, int, 0); -module_param(frandom_minor, int, 0); -module_param(erandom_minor, int, 0); module_param(frandom_bufsize, int, 0); module_param(frandom_chunklimit, int, 0); -MODULE_PARM_DESC(frandom_major,"Major number of /dev/frandom and /dev/erandom"); -MODULE_PARM_DESC(frandom_minor,"Minor number of /dev/frandom"); -MODULE_PARM_DESC(erandom_minor,"Minor number of /dev/erandom"); -MODULE_PARM_DESC(frandom_bufsize,"Internal buffer size in bytes. Default is 256. Must be >= 256"); -MODULE_PARM_DESC(frandom_chunklimit,"Limit for read() blocks size. 0 (default) is unlimited, otherwise must be >= 256"); +MODULE_PARM_DESC(frandom_bufsize, + "Internal buffer size in bytes. Default is 256. Must be >= 256"); +MODULE_PARM_DESC(frandom_chunklimit, + "Limit for read() blocks size. 0 (default) is unlimited," + "otherwise must be >= 256"); -struct frandom_state -{ +struct frandom_state { struct semaphore sem; /* Semaphore on the state structure */ u8 S[256]; /* The state array */ - u8 i; + u8 i; u8 j; char *buf; @@ -83,10 +78,10 @@ static struct frandom_state *erandom_state; static inline void swap_byte(u8 *a, u8 *b) { - u8 swapByte; - - swapByte = *a; - *a = *b; + u8 swapByte; + + swapByte = *a; + *a = *b; *b = swapByte; } @@ -100,7 +95,7 @@ void erandom_get_random_bytes(char *buf, size_t count) unsigned int i; unsigned int j; u8 *S; - + /* If we fail to get the semaphore, we revert to external random data. Since semaphore blocking is expected to be very rare, and interrupts during these rare and very short periods of time even less frequent, @@ -118,29 +113,30 @@ void erandom_get_random_bytes(char *buf, size_t count) RNG is already restored in the boot sequence (not critical, but better. 
*/ - + if (!erandom_seeded) { erandom_seeded = 1; init_rand_state(state, EXTERNAL_SEED); - printk(KERN_INFO "frandom: Seeded global generator now (used by erandom)\n"); + pr_info("frandom: Seeded global generator now (used by erandom)\n"); } - i = state->i; + i = state->i; j = state->j; - S = state->S; + S = state->S; - for (k=0; ki = i; + + state->i = i; state->j = j; up(&state->sem); } +EXPORT_SYMBOL(erandom_get_random_bytes); static void init_rand_state(struct frandom_state *state, int seedflag) { @@ -154,13 +150,13 @@ static void init_rand_state(struct frandom_state *state, int seedflag) get_random_bytes(seed, 256); S = state->S; - for (i=0; i<256; i++) - *S++=i; + for (i = 0; i < 256; i++) + *S++ = i; - j=0; + j = 0; S = state->S; - for (i=0; i<256; i++) { + for (i = 0; i < 256; i++) { j = (j + S[i] + *seed++) & 0xff; swap_byte(&S[i], &S[j]); } @@ -169,8 +165,8 @@ static void init_rand_state(struct frandom_state *state, int seedflag) generated. So we do it: */ - i=0; j=0; - for (k=0; k<256; k++) { + i = 0; j = 0; + for (k = 0; k < 256; k++) { i = (i + 1) & 0xff; j = (j + S[i]) & 0xff; swap_byte(&S[i], &S[j]); @@ -182,16 +178,17 @@ static void init_rand_state(struct frandom_state *state, int seedflag) static int frandom_open(struct inode *inode, struct file *filp) { - + struct frandom_state *state; int num = iminor(inode); /* This should never happen, now when the minors are regsitered - * explicitly + * explicitly (or dynamically) */ - if ((num != frandom_minor) && (num != erandom_minor)) return -ENODEV; - + if ((num != frandom_minor) && (num != erandom_minor)) + return -ENODEV; + state = kmalloc(sizeof(struct frandom_state), GFP_KERNEL); if (!state) return -ENOMEM; @@ -221,7 +218,7 @@ static int frandom_release(struct inode *inode, struct file *filp) kfree(state->buf); kfree(state); - + return 0; } @@ -236,18 +233,18 @@ static ssize_t frandom_read(struct file *filp, char *buf, size_t count, unsigned int i; unsigned int j; u8 *S; - + if (down_interruptible(&state->sem)) return -ERESTARTSYS; - + if ((frandom_chunklimit > 0) && (count > frandom_chunklimit)) count = frandom_chunklimit; ret = count; /* It's either everything or an error... 
*/ - - i = state->i; + + i = state->i; j = state->j; - S = state->S; + S = state->S; while (count) { if (count > frandom_bufsize) @@ -257,13 +254,13 @@ static ssize_t frandom_read(struct file *filp, char *buf, size_t count, localbuf = state->buf; - for (k=0; kbuf, dobytes)) { ret = -EFAULT; goto out; @@ -274,28 +271,26 @@ static ssize_t frandom_read(struct file *filp, char *buf, size_t count, } out: - state->i = i; + state->i = i; state->j = j; up(&state->sem); return ret; } -static struct file_operations frandom_fops = { - read: frandom_read, - open: frandom_open, - release: frandom_release, +static const struct file_operations frandom_fops = { + .read = frandom_read, + .open = frandom_open, + .release = frandom_release, }; -static void frandom_cleanup_module(void) { - unregister_chrdev_region(MKDEV(frandom_major, erandom_minor), 1); +static void frandom_cleanup_module(void) +{ + device_destroy(frandom_class, erandom_devt); cdev_del(&erandom_cdev); - device_destroy(frandom_class, MKDEV(frandom_major, erandom_minor)); - - unregister_chrdev_region(MKDEV(frandom_major, frandom_minor), 1); + device_destroy(frandom_class, frandom_devt); cdev_del(&frandom_cdev); - device_destroy(frandom_class, MKDEV(frandom_major, frandom_minor)); - class_destroy(frandom_class); + unregister_chrdev_region(frandom_devt, NR_FRANDOM_DEVS); kfree(erandom_state->buf); kfree(erandom_state); @@ -308,13 +303,15 @@ static int frandom_init_module(void) /* The buffer size MUST be at least 256 bytes, because we assume that minimal length in init_rand_state(). - */ + */ if (frandom_bufsize < 256) { - printk(KERN_ERR "frandom: Refused to load because frandom_bufsize=%d < 256\n",frandom_bufsize); + pr_err("frandom: Invalid frandom_bufsize: %d\n", + frandom_bufsize); return -EINVAL; } if ((frandom_chunklimit != 0) && (frandom_chunklimit < 256)) { - printk(KERN_ERR "frandom: Refused to load because frandom_chunklimit=%d < 256 and != 0\n",frandom_chunklimit); + pr_err("frandom: Invalid frandom_chunklimit: %d\n", + frandom_chunklimit); return -EINVAL; } @@ -337,84 +334,76 @@ static int frandom_init_module(void) frandom_class = class_create(THIS_MODULE, "fastrng"); if (IS_ERR(frandom_class)) { result = PTR_ERR(frandom_class); - printk(KERN_WARNING "frandom: Failed to register class fastrng\n"); + pr_warn("frandom: Failed to register class fastrng\n"); goto error0; } - + /* * Register your major, and accept a dynamic number. 
This is the * first thing to do, in order to avoid releasing other module's * fops in frandom_cleanup_module() */ + result = alloc_chrdev_region(&frandom_devt, 0, NR_FRANDOM_DEVS, + "frandom"); + if (result < 0) { + pr_warn("frandom: failed to alloc frandom region\n"); + goto error1; + } + + frandom_minor = MINOR(frandom_devt); + erandom_minor = frandom_minor + 1; + erandom_devt = MKDEV(MAJOR(frandom_devt), erandom_minor); + cdev_init(&frandom_cdev, &frandom_fops); frandom_cdev.owner = THIS_MODULE; - result = cdev_add(&frandom_cdev, MKDEV(frandom_major, frandom_minor), 1); + result = cdev_add(&frandom_cdev, frandom_devt, 1); if (result) { - printk(KERN_WARNING "frandom: Failed to add cdev for /dev/frandom\n"); - goto error1; - } - - result = register_chrdev_region(MKDEV(frandom_major, frandom_minor), 1, "/dev/frandom"); - if (result < 0) { - printk(KERN_WARNING "frandom: can't get major/minor %d/%d\n", frandom_major, frandom_minor); - goto error2; + pr_warn("frandom: Failed to add cdev for /dev/frandom\n"); + goto error2; } - frandom_device = device_create(frandom_class, NULL, MKDEV(frandom_major, frandom_minor), NULL, "frandom"); + frandom_device = device_create(frandom_class, NULL, frandom_devt, + NULL, "frandom"); if (IS_ERR(frandom_device)) { - printk(KERN_WARNING "frandom: Failed to create frandom device\n"); + pr_warn("frandom: Failed to create frandom device\n"); goto error3; } cdev_init(&erandom_cdev, &frandom_fops); erandom_cdev.owner = THIS_MODULE; - result = cdev_add(&erandom_cdev, MKDEV(frandom_major, erandom_minor), 1); + result = cdev_add(&erandom_cdev, erandom_devt, 1); if (result) { - printk(KERN_WARNING "frandom: Failed to add cdev for /dev/erandom\n"); - goto error4; + pr_warn("frandom: Failed to add cdev for /dev/erandom\n"); + goto error4; } - result = register_chrdev_region(MKDEV(frandom_major, erandom_minor), 1, "/dev/erandom"); - if (result < 0) { - printk(KERN_WARNING "frandom: can't get major/minor %d/%d\n", frandom_major, erandom_minor); - goto error5; - } - - erandom_device = device_create(frandom_class, NULL, MKDEV(frandom_major, erandom_minor), NULL, "erandom"); + erandom_device = device_create(frandom_class, NULL, erandom_devt, + NULL, "erandom"); if (IS_ERR(erandom_device)) { - printk(KERN_WARNING "frandom: Failed to create erandom device\n"); - goto error6; + pr_warn("frandom: Failed to create erandom device\n"); + goto error5; } return 0; /* succeed */ - error6: - unregister_chrdev_region(MKDEV(frandom_major, erandom_minor), 1); - error5: +error5: cdev_del(&erandom_cdev); - error4: - device_destroy(frandom_class, MKDEV(frandom_major, frandom_minor)); - error3: - unregister_chrdev_region(MKDEV(frandom_major, frandom_minor), 1); - error2: +error4: + device_destroy(frandom_class, frandom_devt); +error3: cdev_del(&frandom_cdev); - error1: +error2: + unregister_chrdev_region(frandom_devt, NR_FRANDOM_DEVS); +error1: class_destroy(frandom_class); - error0: +error0: kfree(erandom_state->buf); kfree(erandom_state); - return result; + return result; } module_init(frandom_init_module); module_exit(frandom_cleanup_module); - -EXPORT_SYMBOL(erandom_get_random_bytes); - -MODULE_AUTHOR("Eli Billauer "); -MODULE_DESCRIPTION("'char_random_frandom' - A fast random generator for " -"general usage"); -MODULE_LICENSE("GPL"); diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig index 05b16f375..5d3b86a33 100644 --- a/drivers/staging/Kconfig +++ b/drivers/staging/Kconfig @@ -110,6 +110,4 @@ source "drivers/staging/wilc1000/Kconfig" source "drivers/staging/most/Kconfig" 
-source "drivers/staging/frandom/Kconfig" - endif # STAGING diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile index 7efef9b7c..7fad58554 100644 --- a/drivers/staging/Makefile +++ b/drivers/staging/Makefile @@ -48,4 +48,3 @@ obj-$(CONFIG_FSL_MC_BUS) += fsl-mc/ obj-$(CONFIG_WILC1000) += wilc1000/ obj-$(CONFIG_MOST) += most/ -obj-$(CONFIG_FRANDOM) += frandom/ diff --git a/drivers/staging/frandom/Kconfig b/drivers/staging/frandom/Kconfig deleted file mode 100644 index 57529dded..000000000 --- a/drivers/staging/frandom/Kconfig +++ /dev/null @@ -1,14 +0,0 @@ -config FRANDOM - tristate "Frandom RNG driver" - help - Frandom is a Linux kernel random number generator, which - is 10-50 times faster than what you get from Linux' built-in - /dev/urandom. And it uses very little (/dev/frandom) or none - (/dev/erandom) of the kernel's entropy pool, so it is very - useful for applications that require a handy source for lots - of random data. - - http://www.billauer.co.il/frandom.html - - If unsure here, select N. - diff --git a/drivers/staging/frandom/Makefile b/drivers/staging/frandom/Makefile deleted file mode 100644 index 5a4ff4ef9..000000000 --- a/drivers/staging/frandom/Makefile +++ /dev/null @@ -1 +0,0 @@ -obj-$(CONFIG_FRANDOM) += frandom.o diff --git a/drivers/staging/frandom/frandom.c b/drivers/staging/frandom/frandom.c deleted file mode 100644 index c6add1eaf..000000000 --- a/drivers/staging/frandom/frandom.c +++ /dev/null @@ -1,409 +0,0 @@ -/* -** frandom.c -** Fast pseudo-random generator -** -** (c) Copyright 2003-2011 Eli Billauer -** http://www.billauer.co.il -** -** This program is free software; you can redistribute it and/or modify -** it under the terms of the GNU General Public License as published by -** the Free Software Foundation; either version 2 of the License, or -** (at your option) any later version. -** -** -*/ - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#define INTERNAL_SEED 0 -#define EXTERNAL_SEED 1 - -#define NR_FRANDOM_DEVS 2 - -static const struct file_operations frandom_fops; /* Values assigned below */ - -static int erandom_seeded; /* Internal flag */ - -static dev_t frandom_devt; -static dev_t erandom_devt; -static int frandom_minor; -static int erandom_minor; -static int frandom_bufsize = 256; -static int frandom_chunklimit; /* =0 means unlimited */ - -static struct cdev frandom_cdev; -static struct cdev erandom_cdev; -static struct class *frandom_class; -struct device *frandom_device; -struct device *erandom_device; - -MODULE_DESCRIPTION("Fast pseudo-random number generator"); -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Eli Billauer"); -module_param(frandom_bufsize, int, 0); -module_param(frandom_chunklimit, int, 0); - -MODULE_PARM_DESC(frandom_bufsize, - "Internal buffer size in bytes. Default is 256. Must be >= 256"); -MODULE_PARM_DESC(frandom_chunklimit, - "Limit for read() blocks size. 
0 (default) is unlimited," - "otherwise must be >= 256"); - -struct frandom_state { - struct semaphore sem; /* Semaphore on the state structure */ - - u8 S[256]; /* The state array */ - u8 i; - u8 j; - - char *buf; -}; - -static struct frandom_state *erandom_state; - -static inline void swap_byte(u8 *a, u8 *b) -{ - u8 swapByte; - - swapByte = *a; - *a = *b; - *b = swapByte; -} - -static void init_rand_state(struct frandom_state *state, int seedflag); - -void erandom_get_random_bytes(char *buf, size_t count) -{ - struct frandom_state *state = erandom_state; - int k; - - unsigned int i; - unsigned int j; - u8 *S; - - /* If we fail to get the semaphore, we revert to external random data. - Since semaphore blocking is expected to be very rare, and interrupts - during these rare and very short periods of time even less frequent, - we take the better-safe-than-sorry approach, and fill the buffer - some expensive random data, in case the caller wasn't aware of this - possibility, and expects random data anyhow. - */ - - if (down_interruptible(&state->sem)) { - get_random_bytes(buf, count); - return; - } - - /* We seed erandom as late as possible, hoping that the kernel's main - RNG is already restored in the boot sequence (not critical, but - better. - */ - - if (!erandom_seeded) { - erandom_seeded = 1; - init_rand_state(state, EXTERNAL_SEED); - pr_info("frandom: Seeded global generator now (used by erandom)\n"); - } - - i = state->i; - j = state->j; - S = state->S; - - for (k = 0; k < count; k++) { - i = (i + 1) & 0xff; - j = (j + S[i]) & 0xff; - swap_byte(&S[i], &S[j]); - *buf++ = S[(S[i] + S[j]) & 0xff]; - } - - state->i = i; - state->j = j; - - up(&state->sem); -} -EXPORT_SYMBOL(erandom_get_random_bytes); - -static void init_rand_state(struct frandom_state *state, int seedflag) -{ - unsigned int i, j, k; - u8 *S; - u8 *seed = state->buf; - - if (seedflag == INTERNAL_SEED) - erandom_get_random_bytes(seed, 256); - else - get_random_bytes(seed, 256); - - S = state->S; - for (i = 0; i < 256; i++) - *S++ = i; - - j = 0; - S = state->S; - - for (i = 0; i < 256; i++) { - j = (j + S[i] + *seed++) & 0xff; - swap_byte(&S[i], &S[j]); - } - - /* It's considered good practice to discard the first 256 bytes - generated. 
So we do it: - */ - - i = 0; j = 0; - for (k = 0; k < 256; k++) { - i = (i + 1) & 0xff; - j = (j + S[i]) & 0xff; - swap_byte(&S[i], &S[j]); - } - - state->i = i; /* Save state */ - state->j = j; -} - -static int frandom_open(struct inode *inode, struct file *filp) -{ - - struct frandom_state *state; - - int num = iminor(inode); - - /* This should never happen, now when the minors are regsitered - * explicitly (or dynamically) - */ - if ((num != frandom_minor) && (num != erandom_minor)) - return -ENODEV; - - state = kmalloc(sizeof(struct frandom_state), GFP_KERNEL); - if (!state) - return -ENOMEM; - - state->buf = kmalloc(frandom_bufsize, GFP_KERNEL); - if (!state->buf) { - kfree(state); - return -ENOMEM; - } - - sema_init(&state->sem, 1); /* Init semaphore as a mutex */ - - if (num == frandom_minor) - init_rand_state(state, EXTERNAL_SEED); - else - init_rand_state(state, INTERNAL_SEED); - - filp->private_data = state; - - return 0; /* Success */ -} - -static int frandom_release(struct inode *inode, struct file *filp) -{ - - struct frandom_state *state = filp->private_data; - - kfree(state->buf); - kfree(state); - - return 0; -} - -static ssize_t frandom_read(struct file *filp, char *buf, size_t count, - loff_t *f_pos) -{ - struct frandom_state *state = filp->private_data; - ssize_t ret; - int dobytes, k; - char *localbuf; - - unsigned int i; - unsigned int j; - u8 *S; - - if (down_interruptible(&state->sem)) - return -ERESTARTSYS; - - if ((frandom_chunklimit > 0) && (count > frandom_chunklimit)) - count = frandom_chunklimit; - - ret = count; /* It's either everything or an error... */ - - i = state->i; - j = state->j; - S = state->S; - - while (count) { - if (count > frandom_bufsize) - dobytes = frandom_bufsize; - else - dobytes = count; - - localbuf = state->buf; - - for (k = 0; k < dobytes; k++) { - i = (i + 1) & 0xff; - j = (j + S[i]) & 0xff; - swap_byte(&S[i], &S[j]); - *localbuf++ = S[(S[i] + S[j]) & 0xff]; - } - - if (copy_to_user(buf, state->buf, dobytes)) { - ret = -EFAULT; - goto out; - } - - buf += dobytes; - count -= dobytes; - } - - out: - state->i = i; - state->j = j; - - up(&state->sem); - return ret; -} - -static const struct file_operations frandom_fops = { - .read = frandom_read, - .open = frandom_open, - .release = frandom_release, -}; - -static void frandom_cleanup_module(void) -{ - device_destroy(frandom_class, erandom_devt); - cdev_del(&erandom_cdev); - device_destroy(frandom_class, frandom_devt); - cdev_del(&frandom_cdev); - unregister_chrdev_region(frandom_devt, NR_FRANDOM_DEVS); - - kfree(erandom_state->buf); - kfree(erandom_state); -} - - -static int frandom_init_module(void) -{ - int result; - - /* The buffer size MUST be at least 256 bytes, because we assume that - minimal length in init_rand_state(). 
- */ - if (frandom_bufsize < 256) { - pr_err("frandom: Invalid frandom_bufsize: %d\n", - frandom_bufsize); - return -EINVAL; - } - if ((frandom_chunklimit != 0) && (frandom_chunklimit < 256)) { - pr_err("frandom: Invalid frandom_chunklimit: %d\n", - frandom_chunklimit); - return -EINVAL; - } - - erandom_state = kmalloc(sizeof(struct frandom_state), GFP_KERNEL); - if (!erandom_state) - return -ENOMEM; - - /* This specific buffer is only used for seeding, so we need - 256 bytes exactly */ - erandom_state->buf = kmalloc(256, GFP_KERNEL); - if (!erandom_state->buf) { - kfree(erandom_state); - return -ENOMEM; - } - - sema_init(&erandom_state->sem, 1); /* Init semaphore as a mutex */ - - erandom_seeded = 0; - - frandom_class = class_create(THIS_MODULE, "fastrng"); - if (IS_ERR(frandom_class)) { - result = PTR_ERR(frandom_class); - pr_warn("frandom: Failed to register class fastrng\n"); - goto error0; - } - - /* - * Register your major, and accept a dynamic number. This is the - * first thing to do, in order to avoid releasing other module's - * fops in frandom_cleanup_module() - */ - - result = alloc_chrdev_region(&frandom_devt, 0, NR_FRANDOM_DEVS, - "frandom"); - if (result < 0) { - pr_warn("frandom: failed to alloc frandom region\n"); - goto error1; - } - - frandom_minor = MINOR(frandom_devt); - erandom_minor = frandom_minor + 1; - erandom_devt = MKDEV(MAJOR(frandom_devt), erandom_minor); - - cdev_init(&frandom_cdev, &frandom_fops); - frandom_cdev.owner = THIS_MODULE; - result = cdev_add(&frandom_cdev, frandom_devt, 1); - if (result) { - pr_warn("frandom: Failed to add cdev for /dev/frandom\n"); - goto error2; - } - - frandom_device = device_create(frandom_class, NULL, frandom_devt, - NULL, "frandom"); - - if (IS_ERR(frandom_device)) { - pr_warn("frandom: Failed to create frandom device\n"); - goto error3; - } - - cdev_init(&erandom_cdev, &frandom_fops); - erandom_cdev.owner = THIS_MODULE; - result = cdev_add(&erandom_cdev, erandom_devt, 1); - if (result) { - pr_warn("frandom: Failed to add cdev for /dev/erandom\n"); - goto error4; - } - - erandom_device = device_create(frandom_class, NULL, erandom_devt, - NULL, "erandom"); - - if (IS_ERR(erandom_device)) { - pr_warn("frandom: Failed to create erandom device\n"); - goto error5; - } - return 0; /* succeed */ - -error5: - cdev_del(&erandom_cdev); -error4: - device_destroy(frandom_class, frandom_devt); -error3: - cdev_del(&frandom_cdev); -error2: - unregister_chrdev_region(frandom_devt, NR_FRANDOM_DEVS); -error1: - class_destroy(frandom_class); -error0: - kfree(erandom_state->buf); - kfree(erandom_state); - - return result; -} - -module_init(frandom_init_module); -module_exit(frandom_cleanup_module); From 2ed64604bc3240191d21b639e1b95036af38b8ec Mon Sep 17 00:00:00 2001 From: Meninblack007 Date: Sun, 17 Jan 2016 02:33:34 -0500 Subject: [PATCH 115/146] upgrade frandom to latest Signed-off-by: Meninblack007 Signed-off-by: Ryan Andri --- drivers/char/frandom.c | 194 ++++++++++++++++++++++------------------- 1 file changed, 103 insertions(+), 91 deletions(-) diff --git a/drivers/char/frandom.c b/drivers/char/frandom.c index c6add1eaf..eb042a343 100644 --- a/drivers/char/frandom.c +++ b/drivers/char/frandom.c @@ -1,6 +1,6 @@ /* ** frandom.c -** Fast pseudo-random generator +** Fast pseudo-random generator ** ** (c) Copyright 2003-2011 Eli Billauer ** http://www.billauer.co.il @@ -19,13 +19,13 @@ #include #include -#include -#include +#include +#include #include -#include +#include #include -#include +#include #include #include #include @@ -33,18 
+33,19 @@ #define INTERNAL_SEED 0 #define EXTERNAL_SEED 1 -#define NR_FRANDOM_DEVS 2 +#define FRANDOM_MAJOR 235 +#define FRANDOM_MINOR 11 +#define ERANDOM_MINOR 12 -static const struct file_operations frandom_fops; /* Values assigned below */ +static struct file_operations frandom_fops; /* Values assigned below */ -static int erandom_seeded; /* Internal flag */ +static int erandom_seeded = 0; /* Internal flag */ -static dev_t frandom_devt; -static dev_t erandom_devt; -static int frandom_minor; -static int erandom_minor; +static int frandom_major = FRANDOM_MAJOR; +static int frandom_minor = FRANDOM_MINOR; +static int erandom_minor = ERANDOM_MINOR; static int frandom_bufsize = 256; -static int frandom_chunklimit; /* =0 means unlimited */ +static int frandom_chunklimit = 0; /* =0 means unlimited */ static struct cdev frandom_cdev; static struct cdev erandom_cdev; @@ -55,20 +56,24 @@ struct device *erandom_device; MODULE_DESCRIPTION("Fast pseudo-random number generator"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Eli Billauer"); +module_param(frandom_major, int, 0); +module_param(frandom_minor, int, 0); +module_param(erandom_minor, int, 0); module_param(frandom_bufsize, int, 0); module_param(frandom_chunklimit, int, 0); -MODULE_PARM_DESC(frandom_bufsize, - "Internal buffer size in bytes. Default is 256. Must be >= 256"); -MODULE_PARM_DESC(frandom_chunklimit, - "Limit for read() blocks size. 0 (default) is unlimited," - "otherwise must be >= 256"); +MODULE_PARM_DESC(frandom_major,"Major number of /dev/frandom and /dev/erandom"); +MODULE_PARM_DESC(frandom_minor,"Minor number of /dev/frandom"); +MODULE_PARM_DESC(erandom_minor,"Minor number of /dev/erandom"); +MODULE_PARM_DESC(frandom_bufsize,"Internal buffer size in bytes. Default is 256. Must be >= 256"); +MODULE_PARM_DESC(frandom_chunklimit,"Limit for read() blocks size. 0 (default) is unlimited, otherwise must be >= 256"); -struct frandom_state { +struct frandom_state +{ struct semaphore sem; /* Semaphore on the state structure */ u8 S[256]; /* The state array */ - u8 i; + u8 i; u8 j; char *buf; @@ -78,10 +83,10 @@ static struct frandom_state *erandom_state; static inline void swap_byte(u8 *a, u8 *b) { - u8 swapByte; - - swapByte = *a; - *a = *b; + u8 swapByte; + + swapByte = *a; + *a = *b; *b = swapByte; } @@ -95,7 +100,7 @@ void erandom_get_random_bytes(char *buf, size_t count) unsigned int i; unsigned int j; u8 *S; - + /* If we fail to get the semaphore, we revert to external random data. Since semaphore blocking is expected to be very rare, and interrupts during these rare and very short periods of time even less frequent, @@ -113,30 +118,29 @@ void erandom_get_random_bytes(char *buf, size_t count) RNG is already restored in the boot sequence (not critical, but better. 
*/ - + if (!erandom_seeded) { erandom_seeded = 1; init_rand_state(state, EXTERNAL_SEED); - pr_info("frandom: Seeded global generator now (used by erandom)\n"); + printk(KERN_INFO "frandom: Seeded global generator now (used by erandom)\n"); } - i = state->i; + i = state->i; j = state->j; - S = state->S; + S = state->S; - for (k = 0; k < count; k++) { + for (k=0; ki = i; + + state->i = i; state->j = j; up(&state->sem); } -EXPORT_SYMBOL(erandom_get_random_bytes); static void init_rand_state(struct frandom_state *state, int seedflag) { @@ -150,13 +154,13 @@ static void init_rand_state(struct frandom_state *state, int seedflag) get_random_bytes(seed, 256); S = state->S; - for (i = 0; i < 256; i++) - *S++ = i; + for (i=0; i<256; i++) + *S++=i; - j = 0; + j=0; S = state->S; - for (i = 0; i < 256; i++) { + for (i=0; i<256; i++) { j = (j + S[i] + *seed++) & 0xff; swap_byte(&S[i], &S[j]); } @@ -165,8 +169,8 @@ static void init_rand_state(struct frandom_state *state, int seedflag) generated. So we do it: */ - i = 0; j = 0; - for (k = 0; k < 256; k++) { + i=0; j=0; + for (k=0; k<256; k++) { i = (i + 1) & 0xff; j = (j + S[i]) & 0xff; swap_byte(&S[i], &S[j]); @@ -184,10 +188,9 @@ static int frandom_open(struct inode *inode, struct file *filp) int num = iminor(inode); /* This should never happen, now when the minors are regsitered - * explicitly (or dynamically) + * explicitly */ - if ((num != frandom_minor) && (num != erandom_minor)) - return -ENODEV; + if ((num != frandom_minor) && (num != erandom_minor)) return -ENODEV; state = kmalloc(sizeof(struct frandom_state), GFP_KERNEL); if (!state) @@ -244,7 +247,7 @@ static ssize_t frandom_read(struct file *filp, char *buf, size_t count, i = state->i; j = state->j; - S = state->S; + S = state->S; while (count) { if (count > frandom_bufsize) @@ -254,7 +257,7 @@ static ssize_t frandom_read(struct file *filp, char *buf, size_t count, localbuf = state->buf; - for (k = 0; k < dobytes; k++) { + for (k=0; kbuf); kfree(erandom_state); @@ -305,13 +310,11 @@ static int frandom_init_module(void) minimal length in init_rand_state(). 
*/ if (frandom_bufsize < 256) { - pr_err("frandom: Invalid frandom_bufsize: %d\n", - frandom_bufsize); + printk(KERN_ERR "frandom: Refused to load because frandom_bufsize=%d < 256\n",frandom_bufsize); return -EINVAL; } if ((frandom_chunklimit != 0) && (frandom_chunklimit < 256)) { - pr_err("frandom: Invalid frandom_chunklimit: %d\n", - frandom_chunklimit); + printk(KERN_ERR "frandom: Refused to load because frandom_chunklimit=%d < 256 and != 0\n",frandom_chunklimit); return -EINVAL; } @@ -334,7 +337,7 @@ static int frandom_init_module(void) frandom_class = class_create(THIS_MODULE, "fastrng"); if (IS_ERR(frandom_class)) { result = PTR_ERR(frandom_class); - pr_warn("frandom: Failed to register class fastrng\n"); + printk(KERN_WARNING "frandom: Failed to register class fastrng\n"); goto error0; } @@ -344,66 +347,75 @@ static int frandom_init_module(void) * fops in frandom_cleanup_module() */ - result = alloc_chrdev_region(&frandom_devt, 0, NR_FRANDOM_DEVS, - "frandom"); - if (result < 0) { - pr_warn("frandom: failed to alloc frandom region\n"); - goto error1; - } - - frandom_minor = MINOR(frandom_devt); - erandom_minor = frandom_minor + 1; - erandom_devt = MKDEV(MAJOR(frandom_devt), erandom_minor); - cdev_init(&frandom_cdev, &frandom_fops); frandom_cdev.owner = THIS_MODULE; - result = cdev_add(&frandom_cdev, frandom_devt, 1); + result = cdev_add(&frandom_cdev, MKDEV(frandom_major, frandom_minor), 1); if (result) { - pr_warn("frandom: Failed to add cdev for /dev/frandom\n"); - goto error2; + printk(KERN_WARNING "frandom: Failed to add cdev for /dev/frandom\n"); + goto error1; } - frandom_device = device_create(frandom_class, NULL, frandom_devt, - NULL, "frandom"); + result = register_chrdev_region(MKDEV(frandom_major, frandom_minor), 1, "/dev/frandom"); + if (result < 0) { + printk(KERN_WARNING "frandom: can't get major/minor %d/%d\n", frandom_major, frandom_minor); + goto error2; + } + + frandom_device = device_create(frandom_class, NULL, MKDEV(frandom_major, frandom_minor), NULL, "frandom"); if (IS_ERR(frandom_device)) { - pr_warn("frandom: Failed to create frandom device\n"); + printk(KERN_WARNING "frandom: Failed to create frandom device\n"); goto error3; } cdev_init(&erandom_cdev, &frandom_fops); erandom_cdev.owner = THIS_MODULE; - result = cdev_add(&erandom_cdev, erandom_devt, 1); + result = cdev_add(&erandom_cdev, MKDEV(frandom_major, erandom_minor), 1); if (result) { - pr_warn("frandom: Failed to add cdev for /dev/erandom\n"); - goto error4; + printk(KERN_WARNING "frandom: Failed to add cdev for /dev/erandom\n"); + goto error4; + } + + result = register_chrdev_region(MKDEV(frandom_major, erandom_minor), 1, "/dev/erandom"); + if (result < 0) { + printk(KERN_WARNING "frandom: can't get major/minor %d/%d\n", frandom_major, erandom_minor); + goto error5; } - erandom_device = device_create(frandom_class, NULL, erandom_devt, - NULL, "erandom"); + erandom_device = device_create(frandom_class, NULL, MKDEV(frandom_major, erandom_minor), NULL, "erandom"); if (IS_ERR(erandom_device)) { - pr_warn("frandom: Failed to create erandom device\n"); - goto error5; + printk(KERN_WARNING "frandom: Failed to create erandom device\n"); + goto error6; } return 0; /* succeed */ -error5: + error6: + unregister_chrdev_region(MKDEV(frandom_major, erandom_minor), 1); + error5: cdev_del(&erandom_cdev); -error4: - device_destroy(frandom_class, frandom_devt); -error3: + error4: + device_destroy(frandom_class, MKDEV(frandom_major, frandom_minor)); + error3: + unregister_chrdev_region(MKDEV(frandom_major, 
frandom_minor), 1); + error2: cdev_del(&frandom_cdev); -error2: - unregister_chrdev_region(frandom_devt, NR_FRANDOM_DEVS); -error1: + error1: class_destroy(frandom_class); -error0: + error0: kfree(erandom_state->buf); kfree(erandom_state); - return result; + return result; } module_init(frandom_init_module); module_exit(frandom_cleanup_module); + +EXPORT_SYMBOL(erandom_get_random_bytes); + +MODULE_AUTHOR("Eli Billauer "); +MODULE_DESCRIPTION("'char_random_frandom' - A fast random generator for " +"general usage"); +MODULE_LICENSE("GPL"); + From 3781cf744fc9799a63b91432b85ccae3edf8e8bd Mon Sep 17 00:00:00 2001 From: UchihaDev Date: Tue, 1 May 2018 21:00:06 +0545 Subject: [PATCH 116/146] Enable Frandom --- arch/arm64/configs/hyperplus_defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/configs/hyperplus_defconfig b/arch/arm64/configs/hyperplus_defconfig index ee615263a..705240cf2 100644 --- a/arch/arm64/configs/hyperplus_defconfig +++ b/arch/arm64/configs/hyperplus_defconfig @@ -3,6 +3,7 @@ # Linux/arm64 4.4.126 Kernel Configuration # +CONFIG_FRANDOM=y CONFIG_WIREGUARD=y # CONFIG_WIREGUARD_DEBUG is not set CONFIG_IOSCHED_ZEN=y From 285d189bb03c48fef84e1cae4cd527b9483e1e3f Mon Sep 17 00:00:00 2001 From: ramgear Date: Tue, 3 Sep 2013 14:25:04 +0700 Subject: [PATCH 117/146] int_sqrt.c: Correction square root algo with naming Signed-off-by: engstk (cherry picked from commit 1161ddce014ae51c3f6fd8338f2d6b1d7756f3c5) --- lib/int_sqrt.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/lib/int_sqrt.c b/lib/int_sqrt.c index 1ef4cc344..0814f0762 100644 --- a/lib/int_sqrt.c +++ b/lib/int_sqrt.c @@ -16,23 +16,33 @@ */ unsigned long int_sqrt(unsigned long x) { - unsigned long b, m, y = 0; + unsigned long tmp; + unsigned long place; + unsigned long root; + unsigned long remainder; if (x <= 1) return x; - m = 1UL << (BITS_PER_LONG - 2); - while (m != 0) { - b = y + m; - y >>= 1; + root = 0; + remainder = x; + place = 1UL << (BITS_PER_LONG - 2); + + while (place > remainder) + place >>= 2; - if (x >= b) { - x -= b; - y += m; + while (place != 0) { + tmp = root + place; + + if (remainder >= tmp) + { + remainder -= tmp; + root += (place << 1); } - m >>= 2; + root >>= 1; + place >>= 2; } - return y; + return root; } EXPORT_SYMBOL(int_sqrt); From b04f536ac5138f416431e94189281f55af1c2115 Mon Sep 17 00:00:00 2001 From: FlyFrog Date: Wed, 4 Sep 2013 12:45:44 +0700 Subject: [PATCH 118/146] int_sqrt: Improve 3x faster integer sqrt. Result on 10,000,000 call. Old: sqrt(12345689) = 3513 real 0m0.768s user 0m0.760s sys 0m0.004s New: sqrt(12345689) = 3513 real 0m0.222s user 0m0.224s sys 0m0.000s Signed-off-by: engstk (cherry picked from commit c58f9a9d5c805a4e122790dd630de1f7ee4366df) --- lib/int_sqrt.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/lib/int_sqrt.c b/lib/int_sqrt.c index 0814f0762..5c3916d09 100644 --- a/lib/int_sqrt.c +++ b/lib/int_sqrt.c @@ -14,34 +14,32 @@ * * A very rough approximation to the sqrt() function. 
*/ -unsigned long int_sqrt(unsigned long x) +inline unsigned long int_sqrt(unsigned long x) { - unsigned long tmp; - unsigned long place; - unsigned long root; - unsigned long remainder; + register unsigned long tmp; + register unsigned long place; + register unsigned long root = 0; if (x <= 1) return x; - root = 0; - remainder = x; place = 1UL << (BITS_PER_LONG - 2); - - while (place > remainder) + + do{ place >>= 2; + }while(place > x); - while (place != 0) { + do { tmp = root + place; + root >>= 1; - if (remainder >= tmp) + if (x >= tmp) { - remainder -= tmp; - root += (place << 1); + x -= tmp; + root += place; } - root >>= 1; place >>= 2; - } + }while (place != 0); return root; } From 871b93b69e6a4ebbbf08f1c8c40ad9107202a411 Mon Sep 17 00:00:00 2001 From: Sultanxda Date: Tue, 2 Jan 2018 19:19:14 -0800 Subject: [PATCH 119/146] cpu: Silence log spam when a CPU is brought up Signed-off-by: Sultanxda --- arch/arm64/kernel/smp.c | 2 +- arch/arm64/kernel/topology.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 7dae55b31..f4a28990a 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -244,7 +244,7 @@ asmlinkage void secondary_start_kernel(void) * the CPU migration code to notice that the CPU is online * before we continue. */ - pr_info("CPU%u: Booted secondary processor [%08x]\n", + pr_debug("CPU%u: Booted secondary processor [%08x]\n", cpu, read_cpuid_id()); update_cpu_boot_status(CPU_BOOT_SUCCESS); /* Make sure the status update is visible before we complete */ diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index a38acc3d1..e55824012 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -369,7 +369,7 @@ static void update_cpu_capacity(unsigned int cpu) set_capacity_scale(cpu, capacity); - pr_info("CPU%d: update cpu_capacity %lu\n", + pr_debug("CPU%d: update cpu_capacity %lu\n", cpu, arch_scale_cpu_capacity(NULL, cpu)); } From ba7f113b27039e2b4726df64c1d9d62854d0f553 Mon Sep 17 00:00:00 2001 From: acuicultor Date: Mon, 30 Apr 2018 13:16:54 +0200 Subject: [PATCH 120/146] Makefile: use ccache --- Makefile | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index 48c9f7116..dc5541bbf 100644 --- a/Makefile +++ b/Makefile @@ -224,6 +224,8 @@ VPATH := $(srctree)$(if $(KBUILD_EXTMOD),:$(KBUILD_EXTMOD)) export srctree objtree VPATH +CCACHE := ccache + # SUBARCH tells the usermode build what the underlying arch is. That is set # first, and if a usermode build is happening, the "ARCH=um" on the command # line overrides the setting of ARCH below. 
If a native build is happening, @@ -258,7 +260,7 @@ SUBARCH := $(shell uname -m | sed -e s/i.86/x86/ -e s/x86_64/x86/ \ # Default value for CROSS_COMPILE is not to prefix executables # Note: Some architectures assign CROSS_COMPILE in their arch/*/Makefile ARCH ?= arm64 -CROSS_COMPILE ?= $(CONFIG_CROSS_COMPILE:"%"=%) +CROSS_COMPILE ?= $(CCACHE) $(CONFIG_CROSS_COMPILE:"%"=%) # Architecture as present in compile.h UTS_MACHINE := $(ARCH) @@ -304,9 +306,9 @@ CONFIG_SHELL := $(shell if [ -x "$$BASH" ]; then echo $$BASH; \ else if [ -x /bin/bash ]; then echo /bin/bash; \ else echo sh; fi ; fi) -HOSTCC = gcc -HOSTCXX = g++ -HOSTCFLAGS = -Wall -Wmissing-prototypes -Wstrict-prototypes -O2 -fomit-frame-pointer -std=gnu89 +HOSTCC = $(CCACHE) gcc +HOSTCXX = $(CCACHE) g++ +HOSTCFLAGS := -Wall -Wmissing-prototypes -Wstrict-prototypes -O2 -fomit-frame-pointer -std=gnu89 HOSTCXXFLAGS = -O2 ifeq ($(shell $(HOSTCC) -v 2>&1 | grep -c "clang version"), 1) @@ -351,7 +353,7 @@ include scripts/Kbuild.include # Make variables (CC, etc...) AS = $(CROSS_COMPILE)as LD = $(CROSS_COMPILE)ld -CC = $(CROSS_COMPILE)gcc +CC = $(CCACHE) $(CROSS_COMPILE)gcc CPP = $(CC) -E AR = $(CROSS_COMPILE)ar NM = $(CROSS_COMPILE)nm From 95334682cb9087d3dc398c799a99e20f35e0c124 Mon Sep 17 00:00:00 2001 From: engstk Date: Sat, 28 Apr 2018 01:43:59 +0100 Subject: [PATCH 121/146] defconfig: make all builtin Signed-off-by: engstk --- arch/arm64/configs/hyperplus_defconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/configs/hyperplus_defconfig b/arch/arm64/configs/hyperplus_defconfig index 705240cf2..6fcf7becb 100644 --- a/arch/arm64/configs/hyperplus_defconfig +++ b/arch/arm64/configs/hyperplus_defconfig @@ -1790,7 +1790,7 @@ CONFIG_PPP_DEFLATE=y CONFIG_PPP_FILTER=y CONFIG_PPP_MPPE=y CONFIG_PPP_MULTILINK=y -CONFIG_PPPOE=m +CONFIG_PPPOE=y CONFIG_PPPOLAC=y CONFIG_PPPOPNS=y CONFIG_PPP_ASYNC=y From e82afe02a8ce2e8caedbd91148739212ee9ea3fe Mon Sep 17 00:00:00 2001 From: engstk Date: Sun, 29 Apr 2018 00:31:46 +0100 Subject: [PATCH 122/146] defconfig: enable missing filesystem options Signed-off-by: engstk --- arch/arm64/configs/hyperplus_defconfig | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/arm64/configs/hyperplus_defconfig b/arch/arm64/configs/hyperplus_defconfig index 6fcf7becb..8f687ebfd 100644 --- a/arch/arm64/configs/hyperplus_defconfig +++ b/arch/arm64/configs/hyperplus_defconfig @@ -4809,7 +4809,9 @@ CONFIG_MSDOS_FS=y CONFIG_VFAT_FS=y CONFIG_FAT_DEFAULT_CODEPAGE=437 CONFIG_FAT_DEFAULT_IOCHARSET="iso8859-1" -# CONFIG_NTFS_FS is not set +CONFIG_NTFS_FS=y +# CONFIG_NTFS_DEBUG is not set +CONFIG_NTFS_RW=y CONFIG_EXFAT_FS=y CONFIG_EXFAT_DISCARD=y CONFIG_EXFAT_DELAYED_SYNC=y From 712210db7ccbe68c7e2682d56c4c6d024a9c49bf Mon Sep 17 00:00:00 2001 From: Alexander Martinz Date: Mon, 14 Aug 2017 09:39:26 +0200 Subject: [PATCH 123/146] include: asm-generic: do not hard code HZ to 100 Use the defconfig value instead Change-Id: I529d08cf06d64f1e3976c0cd876f40b7efefa222 Signed-off-by: Alexander Martinz Signed-off-by: engstk --- include/asm-generic/param.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/asm-generic/param.h b/include/asm-generic/param.h index 04e715bcc..3205b4263 100644 --- a/include/asm-generic/param.h +++ b/include/asm-generic/param.h @@ -5,6 +5,6 @@ # undef HZ # define HZ CONFIG_HZ /* Internal kernel timer frequency */ -# define USER_HZ 100 /* some user interfaces are */ +# define USER_HZ CONFIG_HZ /* some user interfaces are */ # define 
CLOCKS_PER_SEC (USER_HZ) /* in "ticks" like times() */ #endif /* __ASM_GENERIC_PARAM_H */ From a68b9ff0abac4a35eb5a820fdc84303a5a3fb2b9 Mon Sep 17 00:00:00 2001 From: UchihaDev Date: Wed, 2 May 2018 15:29:07 +0545 Subject: [PATCH 124/146] Revert "Enable WireGuard and Permessive SELINUX" This reverts commit 1cba0251b2836b7555e7a7f7131ce903a9f36351. --- net/Kconfig | 1 - net/Makefile | 1 - scripts/Kbuild.include | 2 +- scripts/fetch-latest-wireguard.sh | 18 ------------------ 4 files changed, 1 insertion(+), 21 deletions(-) delete mode 100755 scripts/fetch-latest-wireguard.sh diff --git a/net/Kconfig b/net/Kconfig index 5a4c329ed..16a19fd14 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -80,7 +80,6 @@ config INET Short answer: say Y. if INET -source "net/wireguard/Kconfig" source "net/ipv4/Kconfig" source "net/ipv6/Kconfig" source "net/netlabel/Kconfig" diff --git a/net/Makefile b/net/Makefile index 01a5c6426..c26cc887b 100644 --- a/net/Makefile +++ b/net/Makefile @@ -11,7 +11,6 @@ tmp-$(CONFIG_COMPAT) := compat.o obj-$(CONFIG_NET) += $(tmp-y) # LLC has to be linked before the files in net/802/ -obj-$(CONFIG_WIREGUARD) += wireguard/ obj-$(CONFIG_LLC) += llc/ obj-$(CONFIG_NET) += ethernet/ 802/ sched/ netlink/ obj-$(CONFIG_NETFILTER) += netfilter/ diff --git a/scripts/Kbuild.include b/scripts/Kbuild.include index 02d50ae95..1db6d73c8 100644 --- a/scripts/Kbuild.include +++ b/scripts/Kbuild.include @@ -307,7 +307,7 @@ why = \ echo-why = $(call escsq, $(strip $(why))) endif -$(shell cd "$(srctree)" && ./scripts/fetch-latest-wireguard.sh) + ############################################################################### # # When a Kconfig string contains a filename, it is suitable for diff --git a/scripts/fetch-latest-wireguard.sh b/scripts/fetch-latest-wireguard.sh deleted file mode 100755 index dcc25dec7..000000000 --- a/scripts/fetch-latest-wireguard.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash -set -e -USER_AGENT="WireGuard-AndroidROMBuild/0.1 ($(uname -a))" - -[[ $(( $(date +%s) - $(stat -c %Y "net/wireguard/.check" 2>/dev/null || echo 0) )) -gt 86400 ]] || exit 0 - -[[ $(curl -A "$USER_AGENT" -LSs https://git.zx2c4.com/WireGuard/refs/) =~ snapshot/WireGuard-([0-9.]+)\.tar\.xz ]] - -if [[ -f net/wireguard/version.h && $(< net/wireguard/version.h) == *${BASH_REMATCH[1]}* ]]; then - touch net/wireguard/.check - exit 0 -fi - -rm -rf net/wireguard -mkdir -p net/wireguard -curl -A "$USER_AGENT" -LsS "https://git.zx2c4.com/WireGuard/snapshot/WireGuard-${BASH_REMATCH[1]}.tar.xz" | tar -C "net/wireguard" -xJf - --strip-components=2 "WireGuard-${BASH_REMATCH[1]}/src" -sed -i 's/tristate/bool/;s/default m/default y/;' net/wireguard/Kconfig -touch net/wireguard/.check From 7c0709db82f075b9df7190f8500d1ab9f61f1b1c Mon Sep 17 00:00:00 2001 From: "Jason A. 
Donenfeld" Date: Tue, 12 Dec 2017 14:49:22 +0000 Subject: [PATCH 125/146] net/wireguard: add to tree via updater mechanism Signed-off-by: engstk --- .gitignore | 2 ++ net/Kconfig | 1 + net/Makefile | 1 + scripts/Kbuild.include | 1 + scripts/fetch-latest-wireguard.sh | 18 ++++++++++++++++++ 5 files changed, 23 insertions(+) create mode 100755 scripts/fetch-latest-wireguard.sh diff --git a/.gitignore b/.gitignore index dea29bea5..16325342f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ drivers/hisi/tzdriver/cfc_graph.pyc drivers/hisi/tzdriver/cfc_graphgen.pyc drivers/hisi/tzdriver/cfc_rule_parser.pyc +drivers/huawei_platform/oases/inlinehook_offset.h +net/wireguard/ diff --git a/net/Kconfig b/net/Kconfig index 16a19fd14..5a4c329ed 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -80,6 +80,7 @@ config INET Short answer: say Y. if INET +source "net/wireguard/Kconfig" source "net/ipv4/Kconfig" source "net/ipv6/Kconfig" source "net/netlabel/Kconfig" diff --git a/net/Makefile b/net/Makefile index c26cc887b..ec75f707b 100644 --- a/net/Makefile +++ b/net/Makefile @@ -14,6 +14,7 @@ obj-$(CONFIG_NET) += $(tmp-y) obj-$(CONFIG_LLC) += llc/ obj-$(CONFIG_NET) += ethernet/ 802/ sched/ netlink/ obj-$(CONFIG_NETFILTER) += netfilter/ +obj-$(CONFIG_WIREGUARD) += wireguard/ obj-$(CONFIG_INET) += ipv4/ obj-$(CONFIG_XFRM) += xfrm/ obj-$(CONFIG_UNIX) += unix/ diff --git a/scripts/Kbuild.include b/scripts/Kbuild.include index 1db6d73c8..fa7e7c8cb 100644 --- a/scripts/Kbuild.include +++ b/scripts/Kbuild.include @@ -358,3 +358,4 @@ endif endef # ############################################################################### +$(shell cd "$(srctree)" && ./scripts/fetch-latest-wireguard.sh) diff --git a/scripts/fetch-latest-wireguard.sh b/scripts/fetch-latest-wireguard.sh new file mode 100755 index 000000000..dcc25dec7 --- /dev/null +++ b/scripts/fetch-latest-wireguard.sh @@ -0,0 +1,18 @@ +#!/bin/bash +set -e +USER_AGENT="WireGuard-AndroidROMBuild/0.1 ($(uname -a))" + +[[ $(( $(date +%s) - $(stat -c %Y "net/wireguard/.check" 2>/dev/null || echo 0) )) -gt 86400 ]] || exit 0 + +[[ $(curl -A "$USER_AGENT" -LSs https://git.zx2c4.com/WireGuard/refs/) =~ snapshot/WireGuard-([0-9.]+)\.tar\.xz ]] + +if [[ -f net/wireguard/version.h && $(< net/wireguard/version.h) == *${BASH_REMATCH[1]}* ]]; then + touch net/wireguard/.check + exit 0 +fi + +rm -rf net/wireguard +mkdir -p net/wireguard +curl -A "$USER_AGENT" -LsS "https://git.zx2c4.com/WireGuard/snapshot/WireGuard-${BASH_REMATCH[1]}.tar.xz" | tar -C "net/wireguard" -xJf - --strip-components=2 "WireGuard-${BASH_REMATCH[1]}/src" +sed -i 's/tristate/bool/;s/default m/default y/;' net/wireguard/Kconfig +touch net/wireguard/.check From cf9175d62c98973d31bcee0ddbdf73304f43a2cf Mon Sep 17 00:00:00 2001 From: UchihaDev Date: Wed, 2 May 2018 16:10:20 +0545 Subject: [PATCH 126/146] sched/core: rebase on aosp --- kernel/sched/core.c | 154 ++++++++++++++++---------------------------- 1 file changed, 54 insertions(+), 100 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 558f7bf50..a46275654 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -75,6 +75,7 @@ #include #include #include +#include #include #include @@ -1196,18 +1197,6 @@ void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_ma p->nr_cpus_allowed = cpumask_weight(new_mask); } -static const struct cpumask *adjust_cpumask(const struct task_struct *p, - const struct cpumask *old_mask) -{ - static const unsigned long allowed_cpus = 0xf; - - if (!(p->flags & PF_KTHREAD) 
|| p->kthread_per_cpu) - return old_mask; - - /* Force as many kthreads as possible to run on the little cluster */ - return to_cpumask(&allowed_cpus); -} - void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { struct rq *rq = task_rq(p); @@ -1215,7 +1204,6 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) lockdep_assert_held(&p->pi_lock); - new_mask = adjust_cpumask(p, new_mask); queued = task_on_rq_queued(p); running = task_current(rq, p); @@ -1427,8 +1415,6 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p) struct migration_swap_arg arg; int ret = -EINVAL; - get_online_cpus(); - arg = (struct migration_swap_arg){ .src_task = cur, .src_cpu = task_cpu(cur), @@ -1439,6 +1425,10 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p) if (arg.src_cpu == arg.dst_cpu) goto out; + /* + * These three tests are all lockless; this is OK since all of them + * will be re-checked with proper locks held further down the line. + */ if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu)) goto out; @@ -1452,7 +1442,6 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p) ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg); out: - put_online_cpus(); return ret; } @@ -1674,9 +1663,8 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags, lockdep_assert_held(&p->pi_lock); if (p->nr_cpus_allowed > 1) - cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags, - sibling_count_hint); - + cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags, + sibling_count_hint); /* * In order not to call set_task_cpu() on a blocking task we need @@ -2218,8 +2206,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif -#ifdef CONFIG_CPU_FREQ_STAT - cpufreq_task_stats_init(p); +#ifdef CONFIG_CPU_FREQ_TIMES + cpufreq_task_times_init(p); #endif RB_CLEAR_NODE(&p->dl.rb_node); @@ -2301,11 +2289,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) __sched_fork(clone_flags, p); /* - * We mark the process as running here. This guarantees that + * We mark the process as NEW here. This guarantees that * nobody will actually run it, and a signal or other external * event cannot wake it up and insert it on the runqueue either. */ - p->state = TASK_RUNNING; + p->state = TASK_NEW; /* * Make sure we do not leak PI boosting priority to the child. @@ -2342,6 +2330,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->sched_class = &fair_sched_class; } + init_entity_runnable_average(&p->se); + /* * The child is not yet in the pid-hash so no cgroup attach races, * and the cgroup is pinned to this child due to cgroup_fork() @@ -2475,11 +2465,6 @@ static int dl_overflow(struct task_struct *p, int policy, extern void init_dl_bw(struct dl_bw *dl_b); -#ifdef CONFIG_HISI_EAS_SCHED - #define task_should_forkboost(task) \ - ((task && task->parent && task->parent->pid > 2)) -#endif - /* * wake_up_new_task - wake up a newly created task for the first time. 
* @@ -2493,22 +2478,13 @@ void wake_up_new_task(struct task_struct *p) struct rq *rq; raw_spin_lock_irqsave(&p->pi_lock, flags); + p->state = TASK_RUNNING; walt_init_new_task_load(p); /* Initialize new task's runnable average */ -#ifdef CONFIG_HISI_EAS_SCHED - if (task_should_forkboost(p)) { - init_entity_runnable_average(&p->se); - } else { - struct sched_entity *se= &p->se; - struct sched_avg *sa= &se->avg; - memset(sa, 0, sizeof(*sa)); - } -#else init_entity_runnable_average(&p->se); -#endif #ifdef CONFIG_SMP /* @@ -4634,13 +4610,11 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) struct task_struct *p; int retval; - get_online_cpus(); rcu_read_lock(); p = find_process_by_pid(pid); if (!p) { rcu_read_unlock(); - put_online_cpus(); return -ESRCH; } @@ -4716,7 +4690,6 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) free_cpumask_var(cpus_allowed); out_put_task: put_task_struct(p); - put_online_cpus(); return retval; } @@ -4761,7 +4734,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) unsigned long flags; int retval; - get_online_cpus(); rcu_read_lock(); retval = -ESRCH; @@ -4774,12 +4746,11 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) goto out_unlock; raw_spin_lock_irqsave(&p->pi_lock, flags); - cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); + cpumask_and(mask, &p->cpus_allowed, cpu_active_mask); raw_spin_unlock_irqrestore(&p->pi_lock, flags); out_unlock: rcu_read_unlock(); - put_online_cpus(); return retval; } @@ -5144,8 +5115,17 @@ void sched_show_task(struct task_struct *p) state = __ffs(state) + 1; printk(KERN_INFO "%-15.15s %c", p->comm, state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); +#if BITS_PER_LONG == 32 + if (state == TASK_RUNNING) + printk(KERN_CONT " running "); + else + printk(KERN_CONT " %08lx ", thread_saved_pc(p)); +#else if (state == TASK_RUNNING) printk(KERN_CONT " running task "); + else + printk(KERN_CONT " %016lx ", thread_saved_pc(p)); +#endif #ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); #endif @@ -5187,9 +5167,6 @@ void show_state_filter(unsigned long state_filter) if (!state_filter || (p->state & state_filter)) sched_show_task(p); - /* show 'init' state always */ - if (p->pid == 1) - sched_show_task(p); } #ifdef CONFIG_SCHED_DEBUG @@ -5383,7 +5360,6 @@ void sched_setnuma(struct task_struct *p, int nid) unsigned long flags; bool queued, running; - new_mask = adjust_cpumask(p, new_mask); rq = task_rq_lock(p, &flags); queued = task_on_rq_queued(p); running = task_current(rq, p); @@ -6298,8 +6274,6 @@ static void free_sched_domain(struct rcu_head *rcu) kfree(sd->groups->sgc); kfree(sd->groups); } - if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) - kfree(sd->shared); kfree(sd); } @@ -6818,9 +6792,6 @@ static void claim_allocations(int cpu, struct sched_domain *sd) WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); *per_cpu_ptr(sdd->sd, cpu) = NULL; - if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref)) - *per_cpu_ptr(sdd->sds, cpu) = NULL; - if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) *per_cpu_ptr(sdd->sg, cpu) = NULL; @@ -6868,12 +6839,10 @@ static int sched_domains_curr_level; static struct sched_domain * sd_init(struct sched_domain_topology_level *tl, - const struct cpumask *cpu_map, struct sched_domain *child, int cpu) { - struct sd_data *sdd = &tl->data; - struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); - int sd_id, sd_weight, sd_flags = 0; + struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); + int sd_weight, sd_flags = 0; #ifdef CONFIG_NUMA /* @@ 
-6928,9 +6897,6 @@ sd_init(struct sched_domain_topology_level *tl, #endif }; - cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); - sd_id = cpumask_first(sched_domain_span(sd)); - /* * Convert topological properties into behaviour. */ @@ -6973,16 +6939,7 @@ sd_init(struct sched_domain_topology_level *tl, sd->idle_idx = 1; } - /* - * For all levels sharing cache; connect a sched_domain_shared - * instance. - */ - sd->shared = *per_cpu_ptr(sdd->sds, sd_id); - atomic_inc(&sd->shared->ref); - if (sd->flags & SD_SHARE_PKG_RESOURCES) - atomic_inc(&sd->shared->ref); - - sd->private = sdd; + sd->private = &tl->data; return sd; } @@ -7320,10 +7277,6 @@ static int __sdt_alloc(const struct cpumask *cpu_map) if (!sdd->sd) return -ENOMEM; - sdd->sds = alloc_percpu(struct sched_domain_shared *); - if (!sdd->sds) - return -ENOMEM; - sdd->sg = alloc_percpu(struct sched_group *); if (!sdd->sg) return -ENOMEM; @@ -7334,7 +7287,6 @@ static int __sdt_alloc(const struct cpumask *cpu_map) for_each_cpu(j, cpu_map) { struct sched_domain *sd; - struct sched_domain_shared *sds; struct sched_group *sg; struct sched_group_capacity *sgc; @@ -7345,13 +7297,6 @@ static int __sdt_alloc(const struct cpumask *cpu_map) *per_cpu_ptr(sdd->sd, j) = sd; - sds = kzalloc_node(sizeof(struct sched_domain_shared), - GFP_KERNEL, cpu_to_node(j)); - if (!sds) - return -ENOMEM; - - *per_cpu_ptr(sdd->sds, j) = sds; - sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), GFP_KERNEL, cpu_to_node(j)); if (!sg) @@ -7391,8 +7336,6 @@ static void __sdt_free(const struct cpumask *cpu_map) kfree(*per_cpu_ptr(sdd->sd, j)); } - if (sdd->sds) - kfree(*per_cpu_ptr(sdd->sds, j)); if (sdd->sg) kfree(*per_cpu_ptr(sdd->sg, j)); if (sdd->sgc) @@ -7400,8 +7343,6 @@ static void __sdt_free(const struct cpumask *cpu_map) } free_percpu(sdd->sd); sdd->sd = NULL; - free_percpu(sdd->sds); - sdd->sds = NULL; free_percpu(sdd->sg); sdd->sg = NULL; free_percpu(sdd->sgc); @@ -7413,15 +7354,14 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, struct sched_domain_attr *attr, struct sched_domain *child, int cpu) { - struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu); - if (!sd) - return child; + struct sched_domain *sd = sd_init(tl, child, cpu); + + cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); if (child) { sd->level = child->level + 1; sched_domain_level_max = max(sched_domain_level_max, sd->level); child->parent = sd; - sd->child = child; if (!cpumask_subset(sched_domain_span(child), sched_domain_span(sd))) { @@ -7800,14 +7740,17 @@ void __init sched_init_smp(void) sched_init_numa(); - get_online_cpus(); + /* + * There's no userspace yet to cause hotplug operations; hence all the + * cpu masks are stable and all blatant races in the below code cannot + * happen. + */ mutex_lock(&sched_domains_mutex); init_sched_domains(cpu_active_mask); cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); if (cpumask_empty(non_isolated_cpus)) cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); mutex_unlock(&sched_domains_mutex); - put_online_cpus(); hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); @@ -8275,11 +8218,6 @@ void sched_offline_group(struct task_group *tg) spin_unlock_irqrestore(&task_group_lock, flags); } -/* change task's runqueue when it moves between groups. - * The caller of this function should have put the task in its new group - * by now. 
This function just updates tsk->se.cfs_rq and tsk->se.parent to - * reflect its new group. - */ static void sched_change_group(struct task_struct *tsk, int type) { struct task_group *tg; @@ -8321,7 +8259,7 @@ void sched_move_task(struct task_struct *tsk) queued = task_on_rq_queued(tsk); if (queued) - dequeue_task(rq, tsk, flags); + dequeue_task(rq, tsk, DEQUEUE_SAVE); if (unlikely(running)) put_prev_task(rq, tsk); @@ -8783,6 +8721,7 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) { struct task_struct *task; struct cgroup_subsys_state *css; + int ret = 0; cgroup_taskset_for_each(task, css, tset) { #ifdef CONFIG_RT_GROUP_SCHED @@ -8793,8 +8732,24 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) if (task->sched_class != &fair_sched_class) return -EINVAL; #endif + /* + * Serialize against wake_up_new_task() such that if its + * running, we're sure to observe its full state. + */ + raw_spin_lock_irq(&task->pi_lock); + /* + * Avoid calling sched_move_task() before wake_up_new_task() + * has happened. This would lead to problems with PELT, due to + * move wanting to detach+attach while we're not attached yet. + */ + if (task->state == TASK_NEW) + ret = -EINVAL; + raw_spin_unlock_irq(&task->pi_lock); + + if (ret) + break; } - return 0; + return ret; } static void cpu_cgroup_attach(struct cgroup_taskset *tset) @@ -9137,7 +9092,6 @@ struct cgroup_subsys cpu_cgrp_subsys = { .fork = cpu_cgroup_fork, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, - .allow_attach = subsys_cgroup_allow_attach, .legacy_cftypes = cpu_files, .early_init = 1, }; From 5dc81585e665776d430baea7af337aace2e0508f Mon Sep 17 00:00:00 2001 From: UchihaDev Date: Wed, 2 May 2018 16:27:37 +0545 Subject: [PATCH 127/146] sched/cpudeadline: rebase on aosp --- kernel/sched/cpudeadline.c | 150 ++++++++++++++----------------------- 1 file changed, 58 insertions(+), 92 deletions(-) diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index fba235c7d..dc87f30f2 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -31,81 +31,58 @@ static inline int right_child(int i) return (i << 1) + 2; } -static void cpudl_heapify_down(struct cpudl *cp, int idx) +static void cpudl_exchange(struct cpudl *cp, int a, int b) { - int l, r, largest; + int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; + + swap(cp->elements[a].cpu, cp->elements[b].cpu); + swap(cp->elements[a].dl , cp->elements[b].dl ); - int orig_cpu = cp->elements[idx].cpu; - u64 orig_dl = cp->elements[idx].dl; + swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx); +} - if (left_child(idx) >= cp->size) - return; +static void cpudl_heapify(struct cpudl *cp, int idx) +{ + int l, r, largest; /* adapted from lib/prio_heap.c */ while(1) { - u64 largest_dl; l = left_child(idx); r = right_child(idx); largest = idx; - largest_dl = orig_dl; - if ((l < cp->size) && dl_time_before(orig_dl, - cp->elements[l].dl)) { + if ((l < cp->size) && dl_time_before(cp->elements[idx].dl, + cp->elements[l].dl)) largest = l; - largest_dl = cp->elements[l].dl; - } - if ((r < cp->size) && dl_time_before(largest_dl, - cp->elements[r].dl)) + + if ((r < cp->size) && dl_time_before(cp->elements[largest].dl, + cp->elements[r].dl)) largest = r; if (largest == idx) break; - /* pull largest child onto idx */ - cp->elements[idx].cpu = cp->elements[largest].cpu; - cp->elements[idx].dl = cp->elements[largest].dl; - cp->elements[cp->elements[idx].cpu].idx = idx; + /* Push idx down the heap one level and bump one up */ + 
cpudl_exchange(cp, largest, idx); idx = largest; } - /* actual push down of saved original values orig_* */ - cp->elements[idx].cpu = orig_cpu; - cp->elements[idx].dl = orig_dl; - cp->elements[cp->elements[idx].cpu].idx = idx; } -static void cpudl_heapify_up(struct cpudl *cp, int idx) +static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl) { - int p; - - int orig_cpu = cp->elements[idx].cpu; - u64 orig_dl = cp->elements[idx].dl; + WARN_ON(idx == IDX_INVALID || !cpu_present(idx)); - if (idx == 0) - return; - - do { - p = parent(idx); - if (dl_time_before(orig_dl, cp->elements[p].dl)) - break; - /* pull parent onto idx */ - cp->elements[idx].cpu = cp->elements[p].cpu; - cp->elements[idx].dl = cp->elements[p].dl; - cp->elements[cp->elements[idx].cpu].idx = idx; - idx = p; - } while (idx != 0); - /* actual push up of saved original values orig_* */ - cp->elements[idx].cpu = orig_cpu; - cp->elements[idx].dl = orig_dl; - cp->elements[cp->elements[idx].cpu].idx = idx; -} - -static void cpudl_heapify(struct cpudl *cp, int idx) -{ - if (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl, - cp->elements[idx].dl)) - cpudl_heapify_up(cp, idx); - else - cpudl_heapify_down(cp, idx); + if (dl_time_before(new_dl, cp->elements[idx].dl)) { + cp->elements[idx].dl = new_dl; + cpudl_heapify(cp, idx); + } else { + cp->elements[idx].dl = new_dl; + while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl, + cp->elements[idx].dl)) { + cpudl_exchange(cp, idx, parent(idx)); + idx = parent(idx); + } + } } static inline int cpudl_maximum(struct cpudl *cp) @@ -145,15 +122,16 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, } /* - * cpudl_clear - remove a cpu from the cpudl max-heap + * cpudl_set - update the cpudl max-heap * @cp: the cpudl max-heap context * @cpu: the target cpu + * @dl: the new earliest deadline for this cpu * * Notes: assumes cpu_rq(cpu)->lock is locked * * Returns: (void) */ -void cpudl_clear(struct cpudl *cp, int cpu) +void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) { int old_idx, new_cpu; unsigned long flags; @@ -163,58 +141,46 @@ void cpudl_clear(struct cpudl *cp, int cpu) raw_spin_lock_irqsave(&cp->lock, flags); old_idx = cp->elements[cpu].idx; - if (old_idx == IDX_INVALID) { - /* - * Nothing to remove if old_idx was invalid. - * This could happen if a rq_offline_dl is - * called for a CPU without -dl tasks running. - */ - } else { + if (!is_valid) { + /* remove item */ + if (old_idx == IDX_INVALID) { + /* + * Nothing to remove if old_idx was invalid. + * This could happen if a rq_offline_dl is + * called for a CPU without -dl tasks running. 
+ */ + goto out; + } new_cpu = cp->elements[cp->size - 1].cpu; cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl; cp->elements[old_idx].cpu = new_cpu; cp->size--; cp->elements[new_cpu].idx = old_idx; cp->elements[cpu].idx = IDX_INVALID; - cpudl_heapify(cp, old_idx); + while (old_idx > 0 && dl_time_before( + cp->elements[parent(old_idx)].dl, + cp->elements[old_idx].dl)) { + cpudl_exchange(cp, old_idx, parent(old_idx)); + old_idx = parent(old_idx); + } cpumask_set_cpu(cpu, cp->free_cpus); + cpudl_heapify(cp, old_idx); + goto out; } - raw_spin_unlock_irqrestore(&cp->lock, flags); -} -/* - * cpudl_set - update the cpudl max-heap - * @cp: the cpudl max-heap context - * @cpu: the target cpu - * @dl: the new earliest deadline for this cpu - * - * Notes: assumes cpu_rq(cpu)->lock is locked - * - * Returns: (void) - */ -void cpudl_set(struct cpudl *cp, int cpu, u64 dl) -{ - int old_idx; - unsigned long flags; - - WARN_ON(!cpu_present(cpu)); - - raw_spin_lock_irqsave(&cp->lock, flags); - - old_idx = cp->elements[cpu].idx; if (old_idx == IDX_INVALID) { - int new_idx = cp->size++; - cp->elements[new_idx].dl = dl; - cp->elements[new_idx].cpu = cpu; - cp->elements[cpu].idx = new_idx; - cpudl_heapify_up(cp, new_idx); + cp->size++; + cp->elements[cp->size - 1].dl = 0; + cp->elements[cp->size - 1].cpu = cpu; + cp->elements[cpu].idx = cp->size - 1; + cpudl_change_key(cp, cp->size - 1, dl); cpumask_clear_cpu(cpu, cp->free_cpus); } else { - cp->elements[old_idx].dl = dl; - cpudl_heapify(cp, old_idx); + cpudl_change_key(cp, old_idx, dl); } +out: raw_spin_unlock_irqrestore(&cp->lock, flags); } From a9c921e72ebd2f502844c2c0a0c2375dcdba828f Mon Sep 17 00:00:00 2001 From: UchihaDev Date: Wed, 2 May 2018 16:28:57 +0545 Subject: [PATCH 128/146] sched/cpudeadline.h: rebase on aosp --- kernel/sched/cpudeadline.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index f7da8c55b..fcbdf83fe 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h @@ -23,8 +23,7 @@ struct cpudl { #ifdef CONFIG_SMP int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask); -void cpudl_set(struct cpudl *cp, int cpu, u64 dl); -void cpudl_clear(struct cpudl *cp, int cpu); +void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); int cpudl_init(struct cpudl *cp); void cpudl_set_freecpu(struct cpudl *cp, int cpu); void cpudl_clear_freecpu(struct cpudl *cp, int cpu); From 184ecdff18dcebbd5d152b32976e6ae66e606d37 Mon Sep 17 00:00:00 2001 From: UchihaDev Date: Wed, 2 May 2018 16:31:49 +0545 Subject: [PATCH 129/146] sched/cputime: rebase on aosp --- kernel/sched/cputime.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 9d1e766d7..efe7a210c 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -7,9 +7,7 @@ #include #include "sched.h" #include "walt.h" -#ifdef CONFIG_CPU_FREQ_POWER_STAT -#include -#endif +#include #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -169,9 +167,9 @@ void account_user_time(struct task_struct *p, cputime_t cputime, /* Account for user time used */ acct_account_cputime(p); -#ifdef CONFIG_CPU_FREQ_STAT - /* Account power usage for system time */ - acct_update_power(p, cputime); +#ifdef CONFIG_CPU_FREQ_TIMES + /* Account power usage for user time */ + cpufreq_acct_update_power(p, cputime); #endif } @@ -224,9 +222,9 @@ void __account_system_time(struct task_struct *p, cputime_t cputime, /* Account for system 
time used */ acct_account_cputime(p); -#ifdef CONFIG_CPU_FREQ_STAT +#ifdef CONFIG_CPU_FREQ_TIMES /* Account power usage for system time */ - acct_update_power(p, cputime); + cpufreq_acct_update_power(p, cputime); #endif } From 8ac821d83a524cab68db39314388ebc886e67a8a Mon Sep 17 00:00:00 2001 From: UchihaDev Date: Wed, 2 May 2018 16:33:30 +0545 Subject: [PATCH 130/146] sched/deadline: rebase on aosp --- kernel/sched/deadline.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 1af035bd9..5c6ffddca 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -949,7 +949,7 @@ static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) */ dl_rq->earliest_dl.next = dl_rq->earliest_dl.curr; dl_rq->earliest_dl.curr = deadline; - cpudl_set(&rq->rd->cpudl, rq->cpu, deadline); + cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1); } else if (dl_rq->earliest_dl.next == 0 || dl_time_before(deadline, dl_rq->earliest_dl.next)) { /* @@ -973,7 +973,7 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) if (!dl_rq->dl_nr_running) { dl_rq->earliest_dl.curr = 0; dl_rq->earliest_dl.next = 0; - cpudl_clear(&rq->rd->cpudl, rq->cpu); + cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); } else { struct rb_node *leftmost = dl_rq->rb_leftmost; struct sched_dl_entity *entry; @@ -981,7 +981,7 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) entry = rb_entry(leftmost, struct sched_dl_entity, rb_node); dl_rq->earliest_dl.curr = entry->deadline; dl_rq->earliest_dl.next = next_deadline(rq); - cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline); + cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1); } } @@ -1600,7 +1600,6 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) !cpumask_test_cpu(later_rq->cpu, &task->cpus_allowed) || task_running(rq, task) || - !dl_task(task) || !task_on_rq_queued(task))) { double_unlock_balance(rq, later_rq); later_rq = NULL; @@ -1881,7 +1880,7 @@ static void rq_online_dl(struct rq *rq) cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu); if (rq->dl.dl_nr_running > 0) - cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr); + cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1); } /* Assumes rq->lock is held */ @@ -1890,7 +1889,7 @@ static void rq_offline_dl(struct rq *rq) if (rq->dl.overloaded) dl_clear_overload(rq); - cpudl_clear(&rq->rd->cpudl, rq->cpu); + cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu); } From abc17cf5da6d1abfdd7727d77990a5a9aceb9b27 Mon Sep 17 00:00:00 2001 From: UchihaDev Date: Wed, 2 May 2018 16:37:48 +0545 Subject: [PATCH 131/146] sched/debug: rebase on aosp --- kernel/sched/debug.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index e95cddf57..7f7116622 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -618,9 +618,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) P(se.statistics.nr_wakeups_fbt_no_cpu); P(se.statistics.nr_wakeups_fbt_no_sd); P(se.statistics.nr_wakeups_fbt_pref_idle); - P(se.statistics.nr_wakeups_fbt_pref_idle_lum); - P(se.statistics.nr_wakeups_fbt_best_active); - P(se.statistics.nr_wakeups_fbt_best_idle); P(se.statistics.nr_wakeups_fbt_count); /* cas */ /* select_task_rq_fair() */ From 59441b323a047a7d4d474453bd065f8828a2a098 Mon Sep 17 00:00:00 2001 From: UchihaDev Date: Wed, 2 May 2018 16:38:51 +0545 Subject: [PATCH 132/146] sched/energy: rebase on aosp --- 
kernel/sched/energy.c | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/kernel/sched/energy.c b/kernel/sched/energy.c index 2c3553556..b0656b7a9 100644 --- a/kernel/sched/energy.c +++ b/kernel/sched/energy.c @@ -46,30 +46,6 @@ static void free_resources(void) } } -static void dump_energy_model(void) -{ - int cpu, sd_level, idx; - struct sched_group_energy *sge; - - for_each_possible_cpu(cpu) { - for_each_possible_sd_level(sd_level) { - sge = sge_array[cpu][sd_level]; - if (!sge) - continue; - - pr_info("EAS: cpu %d sd_level = %d\n", cpu, sd_level); - for (idx = 0; idx < sge->nr_idle_states; idx++) - pr_info("Idle state [%d] = p %lu\n", idx, - sge->idle_states[idx].power); - - for (idx = 0; idx < sge->nr_cap_states; idx++) - pr_info("Idle state [%d] = c %lu p %lu\n", idx, - sge->cap_states[idx].cap, - sge->cap_states[idx].power); - } - } -} - void init_sched_energy_costs(void) { struct device_node *cn, *cp; @@ -140,8 +116,6 @@ void init_sched_energy_costs(void) } } - dump_energy_model(); - pr_info("Sched-energy-costs installed from DT\n"); return; From 4eb4f4c31c87963100de47aab03bd869b76e5bf1 Mon Sep 17 00:00:00 2001 From: UchihaDev Date: Wed, 2 May 2018 18:51:56 +0545 Subject: [PATCH 133/146] sched/fair: rebase on aosp (not complete) --- kernel/sched/fair.c | 1070 ++++++++++++++++--------------------------- 1 file changed, 398 insertions(+), 672 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c21a1b14c..6fe74de98 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -64,25 +64,11 @@ unsigned int normalized_sysctl_sched_latency = 6000000ULL; #endif unsigned int sysctl_sched_sync_hint_enable = 1; -unsigned int sysctl_sched_cstate_aware = 0; - -#ifdef CONFIG_HISI_EAS_SCHED -int global_boost_enabled_flag = 0; -int boot_boost = 1; -unsigned int sd_capacity_margin = 1280; -unsigned long up_migration_util_filter = 25; -int hisi_test_fast_cpu(int cpu); -void hisi_get_fast_cpus(struct cpumask *cpumask); -#endif +unsigned int sysctl_sched_cstate_aware = 1; #ifdef CONFIG_SCHED_WALT -#ifdef CONFIG_SCHED_WALT_DEFAULT unsigned int sysctl_sched_use_walt_cpu_util = 1; unsigned int sysctl_sched_use_walt_task_util = 1; -#else -unsigned int sysctl_sched_use_walt_cpu_util = 0; -unsigned int sysctl_sched_use_walt_task_util = 0; -#endif __read_mostly unsigned int sysctl_sched_walt_cpu_high_irqload = (10 * NSEC_PER_MSEC); #endif @@ -142,8 +128,7 @@ const_debug unsigned int sysctl_sched_migration_cost = 250000UL; unsigned int sysctl_sched_wakeup_granularity = 1000000UL; unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; -/*const_debug unsigned int sysctl_sched_migration_cost = 500000UL;*/ -const_debug unsigned int sysctl_sched_migration_cost = 0UL; +const_debug unsigned int sysctl_sched_migration_cost = 500000UL; #endif /* @@ -171,6 +156,12 @@ unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; #endif #endif +/* + * The margin used when comparing utilization with CPU capacity: + * util * margin < capacity * 1024 + */ +unsigned int capacity_margin = 1280; /* ~20% */ + static inline void update_load_add(struct load_weight *lw, unsigned long inc) { lw->weight += inc; @@ -786,6 +777,13 @@ void init_entity_runnable_average(struct sched_entity *se) if (entity_is_task(se)) sa->load_avg = scale_load_down(se->load.weight); sa->load_sum = sa->load_avg * LOAD_AVG_MAX; + /* + * In previous Android versions, we used to have: + * sa->util_avg = scale_load_down(SCHED_LOAD_SCALE); + * sa->util_sum = sa->util_avg * LOAD_AVG_MAX; + * However, that 
functionality has been moved to enqueue. + * It is unclear if we should restore this in enqueue. + */ /* * At this point, util_avg won't be used in select_task_rq_fair anyway */ @@ -794,6 +792,11 @@ void init_entity_runnable_average(struct sched_entity *se) /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); +static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq); +static void attach_entity_cfs_rq(struct sched_entity *se); +static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se); + /* * With new tasks being created, their initial util_avgs are extrapolated * based on the cfs_rq's current util_avg: @@ -823,7 +826,7 @@ void post_init_entity_util_avg(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); struct sched_avg *sa = &se->avg; - long cap = (long)(scale_load_down(SCHED_LOAD_SCALE) - cfs_rq->avg.util_avg) / 2; + long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; if (cap > 0) { if (cfs_rq->avg.util_avg != 0) { @@ -835,18 +838,45 @@ void post_init_entity_util_avg(struct sched_entity *se) } else { sa->util_avg = cap; } + /* + * If we wish to restore tuning via setting initial util, + * this is where we should do it. + */ sa->util_sum = sa->util_avg * LOAD_AVG_MAX; } + + if (entity_is_task(se)) { + struct task_struct *p = task_of(se); + if (p->sched_class != &fair_sched_class) { + /* + * For !fair tasks do: + * + update_cfs_rq_load_avg(now, cfs_rq, false); + attach_entity_load_avg(cfs_rq, se); + switched_from_fair(rq, p); + * + * such that the next switched_to_fair() has the + * expected state. + */ + se->avg.last_update_time = cfs_rq_clock_task(cfs_rq); + return; + } + } + + attach_entity_cfs_rq(se); } -#else +#else /* !CONFIG_SMP */ void init_entity_runnable_average(struct sched_entity *se) { } void post_init_entity_util_avg(struct sched_entity *se) { } -#endif +static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) +{ +} +#endif /* CONFIG_SMP */ /* * Update the current task's runtime statistics. 
@@ -3162,10 +3192,6 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) if (update_freq && (decayed || removed_util)) cfs_rq_util_change(cfs_rq); - /* Trace CPU load, unless cfs_rq belongs to a non-root task_group */ - if (cfs_rq == &rq_of(cfs_rq)->cfs) - trace_sched_load_avg_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq); - return decayed || removed; } @@ -3174,7 +3200,6 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) */ #define UPDATE_TG 0x1 #define SKIP_AGE_LOAD 0x2 -#define SKIP_CPUFREQ 0x4 /* Update task and its cfs_rq load average */ static inline void update_load_avg(struct sched_entity *se, int flags) @@ -3195,7 +3220,7 @@ static inline void update_load_avg(struct sched_entity *se, int flags) cfs_rq->curr == se, NULL); } - decayed = update_cfs_rq_load_avg(now, cfs_rq, !(flags & SKIP_CPUFREQ)); + decayed = update_cfs_rq_load_avg(now, cfs_rq, true); decayed |= propagate_entity_load_avg(se); if (decayed && (flags & UPDATE_TG)) @@ -3315,18 +3340,18 @@ void sync_entity_load_avg(struct sched_entity *se) void remove_entity_load_avg(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 last_update_time; /* - * Newly created task or never used group entity should not be removed - * from its (source) cfs_rq + * tasks cannot exit without having gone through wake_up_new_task() -> + * post_init_entity_util_avg() which will have added things to the + * cfs_rq, so we can remove unconditionally. + * + * Similarly for groups, they will have passed through + * post_init_entity_util_avg() before unregister_sched_fair_group() + * calls this. */ - if (se->avg.last_update_time == 0) - return; - - last_update_time = cfs_rq_last_update_time(cfs_rq); - __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); + sync_entity_load_avg(se); atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); } @@ -3371,7 +3396,6 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) #define UPDATE_TG 0x0 #define SKIP_AGE_LOAD 0x0 -#define SKIP_CPUFREQ 0x0 static inline void update_load_avg(struct sched_entity *se, int not_used1){} static inline void @@ -3588,7 +3612,6 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { - int update_flags; /* * Update run-time statistics of the 'current'. @@ -3603,12 +3626,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * - For group entity, update its weight to reflect the new share * of its group cfs_rq. 
*/ - update_flags = UPDATE_TG; - - if (flags & DEQUEUE_IDLE) - update_flags |= SKIP_CPUFREQ; - - update_load_avg(se, update_flags); + update_load_avg(se, UPDATE_TG); dequeue_entity_load_avg(cfs_rq, se); update_stats_dequeue(cfs_rq, se); @@ -4631,7 +4649,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) WARN_ON(task_rq(p) != rq); - if (rq->cfs.h_nr_running > 1) { + if (cfs_rq->nr_running > 1) { u64 slice = sched_slice(cfs_rq, se); u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; s64 delta = slice - ran; @@ -4673,41 +4691,13 @@ static inline void hrtick_update(struct rq *rq) #ifdef CONFIG_SMP -static inline long -schedtune_task_margin(struct task_struct *p); static bool __cpu_overutilized(int cpu, int delta); static bool cpu_overutilized(int cpu); -static bool cpu_halfutilized(int cpu); -static bool need_spread_task(int cpu); unsigned long boosted_cpu_util(int cpu); #else #define boosted_cpu_util(cpu) cpu_util_freq(cpu) #endif -static inline bool -is_sd_overutilized(struct sched_domain *sd) -{ - if (sd) - return sd->shared->overutilized; - else - return false; -} - -static inline void -set_sd_overutilized(struct sched_domain *sd) -{ - if (sd) - sd->shared->overutilized = true; -} - -static inline void -clear_sd_overutilized(struct sched_domain *sd) -{ - if (sd) - sd->shared->overutilized = false; -} - - /* * The enqueue_task method is called before nr_running is * increased. Here we update the fair scheduling stats and @@ -4717,29 +4707,10 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; - struct sched_domain *sd; struct sched_entity *se = &p->se; #ifdef CONFIG_SMP int task_new = flags & ENQUEUE_WAKEUP_NEW; - /* - * Update SchedTune accounting. - * - * We do it before updating the CPU capacity to ensure the - * boost value of the current task is accounted for in the - * selection of the OPP. - * - * We do it also in the case where we enqueue a throttled task; - * we could argue that a throttled task should not boost a CPU, - * however: - * a) properly implementing CPU boosting considering throttled - * tasks will increase a lot the complexity of the solution - * b) it's not easy to quantify the benefits introduced by - * such a more complex solution. - * Thus, for the time being we go for the simple solution and boost - * also for throttled RQs. - */ - schedtune_enqueue_task(p, cpu_of(rq)); #endif /* @@ -4789,16 +4760,32 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) add_nr_running(rq, 1); #ifdef CONFIG_SMP + /* + * Update SchedTune accounting. + * + * We do it before updating the CPU capacity to ensure the + * boost value of the current task is accounted for in the + * selection of the OPP. + * + * We do it also in the case where we enqueue a throttled task; + * we could argue that a throttled task should not boost a CPU, + * however: + * a) properly implementing CPU boosting considering throttled + * tasks will increase a lot the complexity of the solution + * b) it's not easy to quantify the benefits introduced by + * such a more complex solution. + * Thus, for the time being we go for the simple solution and boost + * also for throttled RQs. 
+ */ + schedtune_enqueue_task(p, cpu_of(rq)); + if (!se) { walt_inc_cumulative_runnable_avg(rq, p); - - rcu_read_lock(); - sd = rcu_dereference(rq->sd); - if (!task_new && !is_sd_overutilized(sd) && - cpu_overutilized(rq->cpu)) - set_sd_overutilized(sd); - rcu_read_unlock(); - + if (!task_new && !rq->rd->overutilized && + cpu_overutilized(rq->cpu)) { + rq->rd->overutilized = true; + trace_sched_overutilized(true); + } } #endif /* CONFIG_SMP */ @@ -4818,20 +4805,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct sched_entity *se = &p->se; int task_sleep = flags & DEQUEUE_SLEEP; -#ifdef CONFIG_SMP - /* - * Update SchedTune accounting - * - * We do it before updating the CPU capacity to ensure the - * boost value of the current task is accounted for in the - * selection of the OPP. - */ - schedtune_dequeue_task(p, cpu_of(rq)); -#endif - - if (task_sleep && rq->nr_running == 1) - flags |= DEQUEUE_IDLE; - for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, flags); @@ -4866,7 +4839,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) #endif for_each_sched_entity(se) { - int update_flags; cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running--; @@ -4875,12 +4847,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_flags = UPDATE_TG; - - if (flags & DEQUEUE_IDLE) - update_flags |= SKIP_CPUFREQ; - - update_load_avg(se, update_flags); + update_load_avg(se, UPDATE_TG); update_cfs_shares(se); } @@ -4888,6 +4855,15 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) sub_nr_running(rq, 1); #ifdef CONFIG_SMP + /* + * Update SchedTune accounting + * + * We do it before updating the CPU capacity to ensure the + * boost value of the current task is accounted for in the + * selection of the OPP. + */ + schedtune_dequeue_task(p, cpu_of(rq)); + if (!se) walt_dec_cumulative_runnable_avg(rq, p); #endif /* CONFIG_SMP */ @@ -5291,17 +5267,6 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) #endif -/* - * Returns the current capacity of cpu after applying both - * cpu and freq scaling. - */ -unsigned long capacity_curr_of(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity_orig * - arch_scale_freq_capacity(NULL, cpu) - >> SCHED_CAPACITY_SHIFT; -} - /* * Returns the current capacity of cpu after applying both * cpu and min freq scaling. @@ -5321,66 +5286,28 @@ static inline bool energy_aware(void) return sched_feat(ENERGY_AWARE); } -/* - * CPU candidates. - * - * These are labels to reference CPU candidates for an energy_diff. - * Currently we support only two possible candidates: the task's previous CPU - * and another candiate CPU. - * More advanced/aggressive EAS selection policies can consider more - * candidates. - */ -#define EAS_CPU_PRV 0 -#define EAS_CPU_NXT 1 -#define EAS_CPU_BKP 2 -#define EAS_CPU_CNT 3 - -/* - * energy_diff - supports the computation of the estimated energy impact in - * moving a "task"'s "util_delta" between different CPU candidates. - */ struct energy_env { - /* Utilization to move */ - struct task_struct *p; - int util_delta; - - /* Mask of CPUs candidates to evaluate */ - cpumask_t cpus_mask; - - /* CPU candidates to evaluate */ - struct { - - /* CPU ID, must be in cpus_mask */ - int cpu_id; - - /* - * Index (into sched_group_energy::cap_states) of the OPP the - * CPU needs to run at if the task is placed on it. 
- * This includes the both active and blocked load, due to - * other tasks on this CPU, as well as the task's own - * utilization. - */ - int cap_idx; - int cap; - - /* Estimated system energy */ - unsigned int energy; - - /* Estimated energy variation wrt EAS_CPU_PRV */ - int nrg_delta; - - } cpu[EAS_CPU_CNT]; - - /* - * Index (into energy_env::cpu) of the morst energy efficient CPU for - * the specified energy_env::task - */ - int next_idx; - - /* Support data */ struct sched_group *sg_top; struct sched_group *sg_cap; - struct sched_group *sg; + int cap_idx; + int util_delta; + int src_cpu; + int dst_cpu; + int trg_cpu; + int energy; + int payoff; + struct task_struct *task; + struct { + int before; + int after; + int delta; + int diff; + } nrg; + struct { + int before; + int after; + int delta; + } cap; }; static int cpu_util_wake(int cpu, struct task_struct *p); @@ -5408,33 +5335,25 @@ static unsigned long __cpu_norm_util(unsigned long util, unsigned long capacity) return (util << SCHED_CAPACITY_SHIFT)/capacity; } -static unsigned long group_max_util(struct energy_env *eenv, int cpu_idx) +static unsigned long group_max_util(struct energy_env *eenv) { unsigned long max_util = 0; unsigned long util; int cpu; for_each_cpu(cpu, sched_group_cpus(eenv->sg_cap)) { - util = cpu_util_wake(cpu, eenv->p); + util = cpu_util_wake(cpu, eenv->task); /* * If we are looking at the target CPU specified by the eenv, * then we should add the (estimated) utilization of the task * assuming we will wake it up on that CPU. */ - if (unlikely(cpu == eenv->cpu[cpu_idx].cpu_id)) + if (unlikely(cpu == eenv->trg_cpu)) util += eenv->util_delta; max_util = max(max_util, util); - /* - * Take into account any minimum frequency imposed - * elsewhere which limits the energy states available - * If the MIN_CAPACITY_CAPPING feature is not enabled - * capacity_min_of will return 0 (not capped). - */ - max_util = max(max_util, capacity_min_of(cpu)); - } return max_util; @@ -5452,21 +5371,21 @@ static unsigned long group_max_util(struct energy_env *eenv, int cpu_idx) * estimate (more busy). */ static unsigned -long group_norm_util(struct energy_env *eenv, int cpu_idx) +long group_norm_util(struct energy_env *eenv, struct sched_group *sg) { - unsigned long capacity = eenv->cpu[cpu_idx].cap; + unsigned long capacity = sg->sge->cap_states[eenv->cap_idx].cap; unsigned long util, util_sum = 0; int cpu; - for_each_cpu(cpu, sched_group_cpus(eenv->sg)) { - util = cpu_util_wake(cpu, eenv->p); + for_each_cpu(cpu, sched_group_cpus(sg)) { + util = cpu_util_wake(cpu, eenv->task); /* * If we are looking at the target CPU specified by the eenv, * then we should add the (estimated) utilization of the task * assuming we will wake it up on that CPU. 
*/ - if (unlikely(cpu == eenv->cpu[cpu_idx].cpu_id)) + if (unlikely(cpu == eenv->trg_cpu)) util += eenv->util_delta; util_sum += __cpu_norm_util(util, capacity); @@ -5475,53 +5394,27 @@ long group_norm_util(struct energy_env *eenv, int cpu_idx) return min_t(unsigned long, util_sum, SCHED_CAPACITY_SCALE); } -static int find_new_capacity(struct energy_env *eenv, int cpu_idx) +static int find_new_capacity(struct energy_env *eenv, + const struct sched_group_energy * const sge) { - const struct sched_group_energy *sge = eenv->sg->sge; int idx, max_idx = sge->nr_cap_states - 1; - unsigned long util = group_max_util(eenv, cpu_idx); + unsigned long util = group_max_util(eenv); /* default is max_cap if we don't find a match */ - eenv->cpu[cpu_idx].cap_idx = max_idx; - eenv->cpu[cpu_idx].cap = sge->cap_states[max_idx].cap; + eenv->cap_idx = max_idx; for (idx = 0; idx < sge->nr_cap_states; idx++) { if (sge->cap_states[idx].cap >= util) { - /* Keep track of SG's capacity */ - eenv->cpu[cpu_idx].cap_idx = idx; - eenv->cpu[cpu_idx].cap = sge->cap_states[idx].cap; + eenv->cap_idx = idx; break; } } - return eenv->cpu[cpu_idx].cap_idx; -} - -static int find_cpu_new_capacity(int cpu, unsigned long util) -{ - struct sched_domain *sd; - const struct sched_group_energy *sge; - int idx; - - sd = rcu_dereference(per_cpu(sd_ea, cpu)); - if (!sd) - return INT_MAX; - - sge = sd->groups->sge; - - for (idx = 0; idx < sge->nr_cap_states; idx++) - if (sge->cap_states[idx].cap >= util) - break; - - if (idx == sge->nr_cap_states) - idx = idx - 1; - - return idx; + return eenv->cap_idx; } -static int group_idle_state(struct energy_env *eenv, int cpu_idx) +static int group_idle_state(struct energy_env *eenv, struct sched_group *sg) { - struct sched_group *sg = eenv->sg; int i, state = INT_MAX; int src_in_grp, dst_in_grp; long grp_util = 0; @@ -5533,10 +5426,8 @@ static int group_idle_state(struct energy_env *eenv, int cpu_idx) /* Take non-cpuidle idling into account (active idle/arch_cpu_idle()) */ state++; - src_in_grp = cpumask_test_cpu(eenv->cpu[EAS_CPU_PRV].cpu_id, - sched_group_cpus(sg)); - dst_in_grp = cpumask_test_cpu(eenv->cpu[cpu_idx].cpu_id, - sched_group_cpus(sg)); + src_in_grp = cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg)); + dst_in_grp = cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg)); if (src_in_grp == dst_in_grp) { /* both CPUs under consideration are in the same group or not in * either group, migration should leave idle state the same. @@ -5549,8 +5440,8 @@ static int group_idle_state(struct energy_env *eenv, int cpu_idx) * achievable when we move the task. */ for_each_cpu(i, sched_group_cpus(sg)) { - grp_util += cpu_util_wake(i, eenv->p); - if (unlikely(i == eenv->cpu[cpu_idx].cpu_id)) + grp_util += cpu_util_wake(i, eenv->task); + if (unlikely(i == eenv->trg_cpu)) grp_util += eenv->util_delta; } @@ -5586,65 +5477,19 @@ static int group_idle_state(struct energy_env *eenv, int cpu_idx) } /* - * calc_sg_energy: compute energy for the eenv's SG (i.e. eenv->sg). - * - * This works in iterations to compute the SG's energy for each CPU - * candidate defined by the energy_env's cpu array. - * - * NOTE: in the following computations for busy_energy and idle_energy we do - * not shift by SCHED_CAPACITY_SHIFT in order to reduce rounding errors. - * The required scaling will be performed just one time, by the calling - * functions, once we accumulated the contributons for all the SGs. 
+ * sched_group_energy(): Computes the absolute energy consumption of cpus + * belonging to the sched_group including shared resources shared only by + * members of the group. Iterates over all cpus in the hierarchy below the + * sched_group starting from the bottom working it's way up before going to + * the next cpu until all cpus are covered at all levels. The current + * implementation is likely to gather the same util statistics multiple times. + * This can probably be done in a faster but more complex way. + * Note: sched_group_energy() may fail when racing with sched_domain updates. */ -static void calc_sg_energy(struct energy_env *eenv) -{ - struct sched_group *sg = eenv->sg; - int busy_energy, idle_energy; - unsigned int busy_power; - unsigned int idle_power; - unsigned long sg_util; - int cap_idx, idle_idx; - int total_energy = 0; - int cpu_idx; - - for (cpu_idx = EAS_CPU_PRV; cpu_idx < EAS_CPU_CNT; ++cpu_idx) { - - - if (eenv->cpu[cpu_idx].cpu_id == -1) - continue; - /* Compute ACTIVE energy */ - cap_idx = find_new_capacity(eenv, cpu_idx); - busy_power = sg->sge->cap_states[cap_idx].power; - /* - * in order to calculate cpu_norm_util, we need to know which - * capacity level the group will be at, so calculate that first - */ - sg_util = group_norm_util(eenv, cpu_idx); - - busy_energy = sg_util * busy_power; - - /* Compute IDLE energy */ - idle_idx = group_idle_state(eenv, cpu_idx); - idle_power = sg->sge->idle_states[idle_idx].power; - - idle_energy = SCHED_LOAD_SCALE - sg_util; - idle_energy *= idle_power; - - total_energy = busy_energy + idle_energy; - eenv->cpu[cpu_idx].energy += total_energy; - } -} - -/* - * compute_energy() computes the absolute variation in energy consumption by - * moving eenv.util_delta from EAS_CPU_PRV to EAS_CPU_NXT. - * - * NOTE: compute_energy() may fail when racing with sched_domain updates, in - * which case we abort by returning -EINVAL. - */ -static int compute_energy(struct energy_env *eenv) +static int sched_group_energy(struct energy_env *eenv) { struct cpumask visit_cpus; + u64 total_energy = 0; int cpu_count; WARN_ON(!eenv->sg_top->sge); @@ -5685,18 +5530,41 @@ static int compute_energy(struct energy_env *eenv) break; do { - eenv->sg_cap = sg; + unsigned long group_util; + int sg_busy_energy, sg_idle_energy; + int cap_idx, idle_idx; + if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight) eenv->sg_cap = sg_shared_cap; + else + eenv->sg_cap = sg; - /* - * Compute the energy for all the candidate - * CPUs in the current visited SG. 
- */ - eenv->sg = sg; - calc_sg_energy(eenv); + cap_idx = find_new_capacity(eenv, sg->sge); + + if (sg->group_weight == 1) { + /* Remove capacity of src CPU (before task move) */ + if (eenv->trg_cpu == eenv->src_cpu && + cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg))) { + eenv->cap.before = sg->sge->cap_states[cap_idx].cap; + eenv->cap.delta -= eenv->cap.before; + } + /* Add capacity of dst CPU (after task move) */ + if (eenv->trg_cpu == eenv->dst_cpu && + cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg))) { + eenv->cap.after = sg->sge->cap_states[cap_idx].cap; + eenv->cap.delta += eenv->cap.after; + } + } + + idle_idx = group_idle_state(eenv, sg); + group_util = group_norm_util(eenv, sg); + + sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power); + sg_idle_energy = ((SCHED_LOAD_SCALE-group_util) + * sg->sge->idle_states[idle_idx].power); + + total_energy += sg_busy_energy + sg_idle_energy; - /* remove CPUs we have just visited */ if (!sd->child) { /* * cpu_count here is the number of @@ -5737,6 +5605,7 @@ static int compute_energy(struct energy_env *eenv) continue; } + eenv->energy = total_energy >> SCHED_CAPACITY_SHIFT; return 0; } @@ -5745,101 +5614,181 @@ static inline bool cpu_in_sg(struct sched_group *sg, int cpu) return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg)); } +static inline unsigned long task_util(struct task_struct *p); + /* - * select_energy_cpu_idx(): estimate the energy impact of changing the - * utilization distribution. - * - * The eenv parameter specifies the changes: utilisation amount and a pair of - * possible CPU candidates (the previous CPU and a different target CPU). - * - * This function returns the index of a CPU candidate specified by the - * energy_env which corresponds to the first CPU saving energy. - * Thus, 0 (EAS_CPU_PRV) means that non of the CPU candidate is more energy - * efficient than running on prev_cpu. This is also the value returned in case - * of abort due to error conditions during the computations. - * A value greater than zero means that the first energy-efficient CPU is the - * one represented by eenv->cpu[eenv->next_idx].cpu_id. + * energy_diff(): Estimate the energy impact of changing the utilization + * distribution. eenv specifies the change: utilisation amount, source, and + * destination cpu. Source or destination cpu may be -1 in which case the + * utilization is removed from or added to the system (e.g. task wake-up). If + * both are specified, the utilization is migrated. */ -static inline int select_energy_cpu_idx(struct energy_env *eenv) +static inline int __energy_diff(struct energy_env *eenv) { struct sched_domain *sd; struct sched_group *sg; - int sd_cpu = -1; - int cpu_idx; - int margin; + int sd_cpu = -1, energy_before = 0, energy_after = 0; + int diff, margin; + + struct energy_env eenv_before = { + .util_delta = task_util(eenv->task), + .src_cpu = eenv->src_cpu, + .dst_cpu = eenv->dst_cpu, + .trg_cpu = eenv->src_cpu, + .nrg = { 0, 0, 0, 0}, + .cap = { 0, 0, 0 }, + .task = eenv->task, + }; - sd_cpu = eenv->cpu[EAS_CPU_PRV].cpu_id; - sd = rcu_dereference(per_cpu(sd_ea, sd_cpu)); - if (!sd) - return EAS_CPU_PRV; + if (eenv->src_cpu == eenv->dst_cpu) + return 0; - cpumask_clear(&eenv->cpus_mask); - for (cpu_idx = EAS_CPU_PRV; cpu_idx < EAS_CPU_CNT; ++cpu_idx) { - int cpu = eenv->cpu[cpu_idx].cpu_id; + sd_cpu = (eenv->src_cpu != -1) ? 
eenv->src_cpu : eenv->dst_cpu; + sd = rcu_dereference(per_cpu(sd_ea, sd_cpu)); - if (cpu < 0) - continue; - cpumask_set_cpu(cpu, &eenv->cpus_mask); - } + if (!sd) + return 0; /* Error */ sg = sd->groups; do { - /* Skip SGs which do not contains a candidate CPU */ - if (!cpumask_intersects(&eenv->cpus_mask, sched_group_cpus(sg))) - continue; - eenv->sg_top = sg; - /* energy is unscaled to reduce rounding errors */ - if (compute_energy(eenv) == -EINVAL) - return EAS_CPU_PRV; + if (cpu_in_sg(sg, eenv->src_cpu) || cpu_in_sg(sg, eenv->dst_cpu)) { + eenv_before.sg_top = eenv->sg_top = sg; + + if (sched_group_energy(&eenv_before)) + return 0; /* Invalid result abort */ + energy_before += eenv_before.energy; + /* Keep track of SRC cpu (before) capacity */ + eenv->cap.before = eenv_before.cap.before; + eenv->cap.delta = eenv_before.cap.delta; + + if (sched_group_energy(eenv)) + return 0; /* Invalid result abort */ + energy_after += eenv->energy; + } } while (sg = sg->next, sg != sd->groups); - /* Scale energy before comparisons */ - for (cpu_idx = EAS_CPU_PRV; cpu_idx < EAS_CPU_CNT; ++cpu_idx) - eenv->cpu[cpu_idx].energy >>= SCHED_CAPACITY_SHIFT; + eenv->nrg.before = energy_before; + eenv->nrg.after = energy_after; + eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before; + eenv->payoff = 0; +#ifndef CONFIG_SCHED_TUNE + trace_sched_energy_diff(eenv->task, + eenv->src_cpu, eenv->dst_cpu, eenv->util_delta, + eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff, + eenv->cap.before, eenv->cap.after, eenv->cap.delta, + eenv->nrg.delta, eenv->payoff); +#endif /* - * Compute the dead-zone margin used to prevent too many task - * migrations with negligible energy savings. - * An energy saving is considered meaningful if it reduces the energy - * consumption of EAS_CPU_PRV CPU candidate by at least ~1.56% + * Dead-zone margin preventing too many migrations. */ - margin = eenv->cpu[EAS_CPU_PRV].energy >> 6; + margin = eenv->nrg.before >> 6; /* ~1.56% */ - /* - * By default the EAS_CPU_PRV CPU is considered the most energy - * efficient, with a 0 energy variation. - */ - eenv->next_idx = EAS_CPU_PRV; + diff = eenv->nrg.after - eenv->nrg.before; - /* - * Compare the other CPU candidates to find a CPU which can be - * more energy efficient then EAS_CPU_PRV - */ - for (cpu_idx = EAS_CPU_NXT; cpu_idx < EAS_CPU_CNT; ++cpu_idx) { - /* Skip not valid scheduled candidates */ - if (eenv->cpu[cpu_idx].cpu_id < 0) - continue; - /* Compute energy delta wrt EAS_CPU_PRV */ - eenv->cpu[cpu_idx].nrg_delta = - eenv->cpu[cpu_idx].energy - - eenv->cpu[EAS_CPU_PRV].energy; - /* filter energy variations within the dead-zone margin */ - if (abs(eenv->cpu[cpu_idx].nrg_delta) < margin) - eenv->cpu[cpu_idx].nrg_delta = 0; - /* update the schedule candidate with min(nrg_delta) */ - if (eenv->cpu[cpu_idx].nrg_delta < - eenv->cpu[eenv->next_idx].nrg_delta) { - eenv->next_idx = cpu_idx; - if (sched_feat(FBT_STRICT_ORDER)) - break; - } + eenv->nrg.diff = (abs(diff) < margin) ? 0 : eenv->nrg.diff; + + return eenv->nrg.diff; +} + +#ifdef CONFIG_SCHED_TUNE + +struct target_nrg schedtune_target_nrg; + +#ifdef CONFIG_CGROUP_SCHEDTUNE +extern bool schedtune_initialized; +#endif /* CONFIG_CGROUP_SCHEDTUNE */ + +/* + * System energy normalization + * Returns the normalized value, in the range [0..SCHED_CAPACITY_SCALE], + * corresponding to the specified energy variation. 
+ */ +static inline int +normalize_energy(int energy_diff) +{ + u32 normalized_nrg; + +#ifdef CONFIG_CGROUP_SCHEDTUNE + /* during early setup, we don't know the extents */ + if (unlikely(!schedtune_initialized)) + return energy_diff < 0 ? -1 : 1 ; +#endif /* CONFIG_CGROUP_SCHEDTUNE */ + +#ifdef CONFIG_SCHED_DEBUG + { + int max_delta; + + /* Check for boundaries */ + max_delta = schedtune_target_nrg.max_power; + max_delta -= schedtune_target_nrg.min_power; + WARN_ON(abs(energy_diff) >= max_delta); } +#endif - return eenv->next_idx; + /* Do scaling using positive numbers to increase the range */ + normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff; + + /* Scale by energy magnitude */ + normalized_nrg <<= SCHED_CAPACITY_SHIFT; + + /* Normalize on max energy for target platform */ + normalized_nrg = reciprocal_divide( + normalized_nrg, schedtune_target_nrg.rdiv); + + return (energy_diff < 0) ? -normalized_nrg : normalized_nrg; } +static inline int +energy_diff(struct energy_env *eenv) +{ + int boost = schedtune_task_boost(eenv->task); + int nrg_delta; + + /* Conpute "absolute" energy diff */ + __energy_diff(eenv); + + /* Return energy diff when boost margin is 0 */ + if (boost == 0) { + trace_sched_energy_diff(eenv->task, + eenv->src_cpu, eenv->dst_cpu, eenv->util_delta, + eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff, + eenv->cap.before, eenv->cap.after, eenv->cap.delta, + 0, -eenv->nrg.diff); + return eenv->nrg.diff; + } + + /* Compute normalized energy diff */ + nrg_delta = normalize_energy(eenv->nrg.diff); + eenv->nrg.delta = nrg_delta; + + eenv->payoff = schedtune_accept_deltas( + eenv->nrg.delta, + eenv->cap.delta, + eenv->task); + + trace_sched_energy_diff(eenv->task, + eenv->src_cpu, eenv->dst_cpu, eenv->util_delta, + eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff, + eenv->cap.before, eenv->cap.after, eenv->cap.delta, + eenv->nrg.delta, eenv->payoff); + + /* + * When SchedTune is enabled, the energy_diff() function will return + * the computed energy payoff value. Since the energy_diff() return + * value is expected to be negative by its callers, this evaluation + * function return a negative value each time the evaluation return a + * positive payoff, which is the condition for the acceptance of + * a scheduling decision + */ + return -eenv->payoff; +} +#else /* CONFIG_SCHED_TUNE */ +#define energy_diff(eenv) __energy_diff(eenv) +#endif + /* * Detect M:N waker/wakee relationships via a switching-frequency heuristic. 
* A waker of many should wake a different task than the one last awakened @@ -5945,9 +5894,8 @@ static inline unsigned long task_util(struct task_struct *p) return p->se.avg.util_avg; } -unsigned int capacity_margin = 1280; /* ~20% margin */ +static inline unsigned long boosted_task_util(struct task_struct *task); -static inline unsigned long boosted_task_util(struct task_struct *p); static inline bool __task_fits(struct task_struct *p, int cpu, int util) { unsigned long capacity = capacity_of(cpu); @@ -5961,33 +5909,16 @@ static inline bool task_fits_max(struct task_struct *p, int cpu) { unsigned long capacity = capacity_of(cpu); unsigned long max_capacity = cpu_rq(cpu)->rd->max_cpu_capacity.val; -#ifdef CONFIG_HISI_EAS_SCHED - unsigned long max_allowed_cap = 0; - int allowed_cpu; -#endif if (capacity == max_capacity) return true; -#ifdef CONFIG_HISI_EAS_SCHED - for_each_cpu(allowed_cpu, tsk_cpus_allowed(p)) { - if (capacity_orig_of(allowed_cpu) > max_allowed_cap) - max_allowed_cap = capacity_orig_of(allowed_cpu); - } - - /* allowed cpus is limited */ - if (max_allowed_cap <= capacity_orig_of(cpu)) + if (capacity * capacity_margin > max_capacity * 1024) return true; -#endif return __task_fits(p, cpu, 0); } -static inline bool task_fits_spare(struct task_struct *p, int cpu) -{ - return __task_fits(p, cpu, cpu_util(cpu)); -} - static bool __cpu_overutilized(int cpu, int delta) { return (capacity_of(cpu) * 1024) < ((cpu_util(cpu) + delta) * capacity_margin); @@ -5998,64 +5929,6 @@ static bool cpu_overutilized(int cpu) return __cpu_overutilized(cpu, 0); } -static bool cpu_halfutilized(int cpu) -{ - return capacity_of(cpu) < (cpu_util(cpu) * 2); -} - -static bool need_spread_task(int cpu) -{ - struct sched_domain *sd; - int i; - - sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); - if (!sd) { - return false; - } - - for_each_cpu(i, sched_domain_span(sd)) { - if (cpu_util(i) * capacity_margin < capacity_orig_of(i) * 1024) { - return false; - } - } - - return true; -} - -static bool need_want_affine(struct task_struct *p, int cpu) -{ - int capacity = capacity_orig_of(cpu); - int max_capacity = cpu_rq(cpu)->rd->max_cpu_capacity.val; - unsigned long margin = schedtune_task_margin(p); - struct sched_domain *sd; - int affine = 0, i; - - if (margin) - return 1; - - if (capacity != max_capacity) - return 1; - - rcu_read_lock(); - - sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); - if (!sd) { - rcu_read_unlock(); - return 1; - } - - for_each_cpu(i, sched_domain_span(sd)) { - if (idle_cpu(i) && __task_fits(p, i, cpu_util_wake(i, p))) { - affine = 1; - break; - } - } - - rcu_read_unlock(); - - return affine; -} - #ifdef CONFIG_SCHED_TUNE struct reciprocal_value schedtune_spc_rdiv; @@ -6076,11 +5949,11 @@ schedtune_margin(unsigned long signal, long boost) if (boost >= 0) { margin = SCHED_CAPACITY_SCALE - signal; margin *= boost; - } else { + } else margin = -signal * boost; - } margin = reciprocal_divide(margin, schedtune_spc_rdiv); + if (boost < 0) margin *= -1; @@ -6099,16 +5972,16 @@ schedtune_cpu_margin(unsigned long util, int cpu) } static inline long -schedtune_task_margin(struct task_struct *p) +schedtune_task_margin(struct task_struct *task) { - int boost = schedtune_task_boost(p); + int boost = schedtune_task_boost(task); unsigned long util; long margin; if (boost == 0) return 0; - util = task_util(p); + util = task_util(task); margin = schedtune_margin(util, boost); return margin; @@ -6122,8 +5995,8 @@ schedtune_cpu_margin(unsigned long util, int cpu) return 0; } -static inline 
long -schedtune_task_margin(struct task_struct *p) +static inline int +schedtune_task_margin(struct task_struct *task) { return 0; } @@ -6142,16 +6015,21 @@ boosted_cpu_util(int cpu) } static inline unsigned long -boosted_task_util(struct task_struct *p) +boosted_task_util(struct task_struct *task) { - unsigned long util = task_util(p); - long margin = schedtune_task_margin(p); + unsigned long util = task_util(task); + long margin = schedtune_task_margin(task); - trace_sched_boost_task(p, util, margin); + trace_sched_boost_task(task, util, margin); return util + margin; } +static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) +{ + return capacity_orig_of(cpu) - cpu_util_wake(cpu, p); +} + /* * find_idlest_group finds and returns the least busy CPU group within the * domain. @@ -6163,10 +6041,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu, int sd_flag) { struct sched_group *idlest = NULL, *group = sd->groups; - struct sched_group *fit_group = NULL, *spare_group = NULL; + struct sched_group *most_spare_sg = NULL; unsigned long min_load = ULONG_MAX, this_load = ULONG_MAX; - unsigned long fit_capacity = ULONG_MAX; - unsigned long max_spare_capacity = capacity_margin - SCHED_LOAD_SCALE; + unsigned long most_spare = 0, this_spare = 0; int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; @@ -6174,7 +6051,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, load_idx = sd->wake_idx; do { - unsigned long load, avg_load, spare_capacity; + unsigned long load, avg_load, spare_cap, max_spare_cap; int local_group; int i; @@ -6186,8 +6063,12 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(group)); - /* Tally up the load of all CPUs in the group */ + /* + * Tally up the load of all CPUs in the group and find + * the group containing the CPU with most spare capacity. + */ avg_load = 0; + max_spare_cap = 0; for_each_cpu(i, sched_group_cpus(group)) { /* Bias balancing toward cpus of our domain */ @@ -6198,24 +6079,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, avg_load += load; - /* - * Look for most energy-efficient group that can fit - * that can fit the task. - */ - if (capacity_of(i) < fit_capacity && __task_fits(p, i, cpu_util_wake(i, p))) { - fit_capacity = capacity_of(i); - fit_group = group; - } + spare_cap = capacity_spare_wake(i, p); - /* - * Look for group which has most spare capacity on a - * single cpu. 
- */ - spare_capacity = capacity_of(i) - cpu_util_wake(i, p); - if (spare_capacity > max_spare_capacity) { - max_spare_capacity = spare_capacity; - spare_group = group; - } + if (spare_cap > max_spare_cap) + max_spare_cap = spare_cap; } /* Adjust by relative CPU capacity of the group */ @@ -6223,172 +6090,46 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, if (local_group) { this_load = avg_load; - } else if (avg_load < min_load) { - min_load = avg_load; - idlest = group; - } - } while (group = group->next, group != sd->groups); - - if (fit_group && (sd->flags & SD_ASYM_CPUCAPACITY)) - return fit_group; - - if (spare_group && !(sd->flags & SD_ASYM_CPUCAPACITY)) - return spare_group; - - if (!idlest || 100*this_load < imbalance*min_load) - return NULL; - return idlest; -} - -#ifdef CONFIG_HISI_EAS_SCHED -static unsigned long cpu_spare_capacity(int cpu, unsigned long util) -{ - unsigned long spare_capacity; - spare_capacity = capacity_of(cpu) - util; - spare_capacity = clamp(spare_capacity, 0UL, capacity_of(cpu)); - - return spare_capacity; -} - -static int -find_spare_boost_cpu(struct cpumask *group_cpus, struct task_struct *p) -{ - int spare_boost_cpu = -1; - unsigned long max_spare_capacity = 0; - unsigned long spare_capacity; - int i; - int spare_idle_cpu = -1; - unsigned long max_idle_cap = 0; - unsigned long wake_util; - - for_each_cpu_and(i, group_cpus, tsk_cpus_allowed(p)) { - /* If the CPU's utilizaiton is over 60%, - * then we don't consider the cpu as spare one. - */ - wake_util = cpu_util_wake(i, p); - if (!__task_fits(p, i, wake_util)) - continue; - - spare_capacity = cpu_spare_capacity(i, wake_util); - if (idle_cpu(i)) { - if (spare_idle_cpu != i && spare_capacity > max_idle_cap) { - spare_idle_cpu = i; - max_idle_cap = spare_capacity; - } + this_spare = max_spare_cap; } else { - if (spare_capacity > max_spare_capacity) { - max_spare_capacity = spare_capacity; - spare_boost_cpu = i; + if (avg_load < min_load) { + min_load = avg_load; + idlest = group; } - } - } - - spare_boost_cpu = (spare_idle_cpu != -1) ? spare_idle_cpu : spare_boost_cpu; - return spare_boost_cpu; -} - -static int select_boost_cpu(struct task_struct *p, int spare_cpu, int boost_cpu) -{ - unsigned long cap_boost_cpu, cap_spare_cpu; - - cap_boost_cpu = cpu_spare_capacity(boost_cpu, cpu_util_wake(boost_cpu, p)); - cap_spare_cpu = cpu_spare_capacity(spare_cpu, cpu_util_wake(spare_cpu, p)); - - /* select the cpu with max spare cap */ - if (cap_boost_cpu < cap_spare_cpu) - boost_cpu = spare_cpu; - - return boost_cpu; -} - -/* - * find_boost_cpu - find the idlest cpu among the fast_cpus. - */ -static int -find_boost_cpu(struct cpumask *group_cpus, struct task_struct *p, int this_cpu) -{ - unsigned long load, min_load = ULONG_MAX; - unsigned int min_exit_latency = UINT_MAX; - u64 latest_idle_timestamp = 0; - int least_loaded_cpu = this_cpu; - int shallowest_idle_cpu = -1; - int i; - - /* Traverse only the allowed CPUs */ - for_each_cpu_and(i, group_cpus, tsk_cpus_allowed(p)) { - if (!cpumask_test_cpu(i, cpu_online_mask)) - continue; - - if (idle_cpu(i)) { - struct rq *rq = cpu_rq(i); - struct cpuidle_state *idle = idle_get_state(rq); - if (idle && idle->exit_latency < min_exit_latency) { - /* - * We give priority to a CPU whose idle state - * has the smallest exit latency irrespective - * of any idle timestamp. 
- */ - min_exit_latency = idle->exit_latency; - latest_idle_timestamp = rq->idle_stamp; - shallowest_idle_cpu = i; - } else if ((!idle || idle->exit_latency == min_exit_latency) && - rq->idle_stamp > latest_idle_timestamp) { - /* - * If equal or no active idle state, then - * the most recently idled CPU might have - * a warmer cache. - */ - latest_idle_timestamp = rq->idle_stamp; - shallowest_idle_cpu = i; - } - } else if (shallowest_idle_cpu == -1) { - load = weighted_cpuload(i); - if (load < min_load || (load == min_load && i == this_cpu)) { - min_load = load; - least_loaded_cpu = i; + if (most_spare < max_spare_cap) { + most_spare = max_spare_cap; + most_spare_sg = group; } } - } - - return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu; -} - -static int -find_global_boost_cpu(struct task_struct *p) -{ - struct cpumask fast_cpus; - struct cpumask spare_cpus; - int boost_cpu = -1; - int spare_cpu = -1; - hisi_get_fast_cpus(&fast_cpus); - - if (cpumask_empty(&fast_cpus) || !cpumask_intersects(tsk_cpus_allowed(p), &fast_cpus) - || !cpumask_intersects(&fast_cpus, cpu_online_mask)) - return -1; - - boost_cpu = find_boost_cpu(&fast_cpus, p, cpumask_first(&fast_cpus)); - if (boost_cpu != -1) { - if (idle_cpu(boost_cpu)) - return boost_cpu; - - /* Enable spare boost cpu feature */ - /* If util of boost_cpu is over 90%, check if any spare cpu is available.*/ - if ((capacity_of(boost_cpu) * 1024) < (cpu_util_wake(boost_cpu, p) * 1138)) { - cpumask_xor(&spare_cpus, &fast_cpus, cpu_online_mask); - spare_cpu = find_spare_boost_cpu(&spare_cpus, p); + } while (group = group->next, group != sd->groups); - /* if spare_cpu available, select max spare one . */ - if (spare_cpu != -1) - boost_cpu= select_boost_cpu(p, spare_cpu, boost_cpu); + /* + * The cross-over point between using spare capacity or least load + * is too conservative for high utilization tasks on partially + * utilized systems if we require spare_capacity > task_util(p), + * so we allow for some task stuffing by using + * spare_capacity > task_util(p)/2. + * + * Spare capacity can't be used for fork because the utilization has + * not been set yet, we must first select a rq to compute the initial + * utilization. + */ + if (sd_flag & SD_BALANCE_FORK) + goto skip_spare; - } - } + if (this_spare > task_util(p) / 2 && + imbalance*this_spare > 100*most_spare) + return NULL; + else if (most_spare > task_util(p) / 2) + return most_spare_sg; - return boost_cpu; +skip_spare: + if (!idlest || 100*this_load < imbalance*min_load) + return NULL; + return idlest; } -#endif /* * find_idlest_group_cpu - find the idlest cpu among the cpus in group. 
@@ -6403,9 +6144,13 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this int shallowest_idle_cpu = -1; int i; + /* Check if we have any choice: */ + if (group->group_weight == 1) + return cpumask_first(sched_group_cpus(group)); + /* Traverse only the allowed CPUs */ for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { - if (task_fits_spare(p, i)) { + if (idle_cpu(i)) { struct rq *rq = cpu_rq(i); struct cpuidle_state *idle = idle_get_state(rq); if (idle && idle->exit_latency < min_exit_latency) { @@ -6417,8 +6162,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this min_exit_latency = idle->exit_latency; latest_idle_timestamp = rq->idle_stamp; shallowest_idle_cpu = i; - } else if (idle_cpu(i) && - (!idle || idle->exit_latency == min_exit_latency) && + } else if ((!idle || idle->exit_latency == min_exit_latency) && rq->idle_stamp > latest_idle_timestamp) { /* * If equal or no active idle state, then @@ -6427,14 +6171,6 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this */ latest_idle_timestamp = rq->idle_stamp; shallowest_idle_cpu = i; - } else if (shallowest_idle_cpu == -1) { - /* - * If we haven't found an idle CPU yet - * pick a non-idle one that can fit the task as - * fallback. - */ - shallowest_idle_cpu = i; - } } else if (shallowest_idle_cpu == -1) { load = weighted_cpuload(i); if (load < min_load || (load == min_load && i == this_cpu)) { @@ -6635,31 +6371,27 @@ static int cpu_util_wake(int cpu, struct task_struct *p) return (util >= capacity) ? capacity : util; } -static int start_cpu(bool prefer_idle) +static int start_cpu(bool boosted) { struct root_domain *rd = cpu_rq(smp_processor_id())->rd; - return prefer_idle ? rd->max_cap_orig_cpu : rd->min_cap_orig_cpu; + return boosted ? rd->max_cap_orig_cpu : rd->min_cap_orig_cpu; } static inline int find_best_target(struct task_struct *p, int *backup_cpu, - bool prefer_idle) + bool boosted, bool prefer_idle) { - unsigned long high_cpu_util = SCHED_CAPACITY_SCALE; - unsigned long task_util_boosted = boosted_task_util(p); + unsigned long best_idle_min_cap_orig = ULONG_MAX; + unsigned long min_util = boosted_task_util(p); unsigned long target_capacity = ULONG_MAX; unsigned long min_wake_util = ULONG_MAX; unsigned long target_max_spare_cap = 0; unsigned long best_active_util = ULONG_MAX; - unsigned long target_idle_max_spare_cap = 0; - unsigned long target_max_free_util = 0; int best_idle_cstate = INT_MAX; - bool low_util_mode = true; struct sched_domain *sd; struct sched_group *sg; int best_active_cpu = -1; int best_idle_cpu = -1; - int low_util_cpu = -1; int target_cpu = -1; int cpu, i; @@ -6668,8 +6400,8 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu, schedstat_inc(p, se.statistics.nr_wakeups_fbt_attempts); schedstat_inc(this_rq(), eas_stats.fbt_attempts); - /* Find start CPU based on prefer_idle flag*/ - cpu = start_cpu(prefer_idle); + /* Find start CPU based on boost value */ + cpu = start_cpu(boosted); if (cpu < 0) { schedstat_inc(p, se.statistics.nr_wakeups_fbt_no_cpu); schedstat_inc(this_rq(), eas_stats.fbt_no_cpu); @@ -6684,12 +6416,6 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu, return -1; } - /* - * Consider a CPU highly utilized when it's utilization is bigger than - * 1/4 of the maximum capacity. 
- */ - high_cpu_util = SCHED_CAPACITY_SCALE >> 2; - /* Scan CPUs in all SDs */ sg = sd->groups; do { From b9760947f9f64cc15e78c97dddaacac8ab7307fb Mon Sep 17 00:00:00 2001 From: UchihaDev Date: Wed, 2 May 2018 18:56:55 +0545 Subject: [PATCH 134/146] sched/fair: rebase aosp (all aosp) --- kernel/sched/fair.c | 836 ++++++++------------------------------------ 1 file changed, 143 insertions(+), 693 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6fe74de98..cb9063454 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -42,7 +42,6 @@ #include #endif - /* * Targeted preemption latency for CPU-bound tasks: * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) @@ -55,13 +54,8 @@ * (to see the precise effective timeslice length of your workload, * run vmstat and monitor the context-switches (cs) field) */ -#ifdef CONFIG_ZEN_INTERACTIVE -unsigned int sysctl_sched_latency = 3000000ULL; -unsigned int normalized_sysctl_sched_latency = 3000000ULL; -#else unsigned int sysctl_sched_latency = 6000000ULL; unsigned int normalized_sysctl_sched_latency = 6000000ULL; -#endif unsigned int sysctl_sched_sync_hint_enable = 1; unsigned int sysctl_sched_cstate_aware = 1; @@ -88,22 +82,13 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling * Minimal preemption granularity for CPU-bound tasks: * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) */ -#ifdef CONFIG_ZEN_INTERACTIVE -unsigned int sysctl_sched_min_granularity = 300000ULL; -unsigned int normalized_sysctl_sched_min_granularity = 300000ULL; -#else unsigned int sysctl_sched_min_granularity = 750000ULL; unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; -#endif /* * is kept at sysctl_sched_latency / sysctl_sched_min_granularity */ -#ifdef CONFIG_ZEN_INTERACTIVE -static unsigned int sched_nr_latency = 10; -#else static unsigned int sched_nr_latency = 8; -#endif /* * After fork, child runs first. If set to 0 (default) then @@ -119,17 +104,10 @@ unsigned int sysctl_sched_child_runs_first __read_mostly; * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. */ -#ifdef CONFIG_ZEN_INTERACTIVE -unsigned int sysctl_sched_wakeup_granularity = 500000UL; -unsigned int normalized_sysctl_sched_wakeup_granularity = 500000UL; - -const_debug unsigned int sysctl_sched_migration_cost = 250000UL; -#else unsigned int sysctl_sched_wakeup_granularity = 1000000UL; unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; -#endif /* * The exponential sliding window over which load is averaged for shares @@ -149,12 +127,8 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; * * default: 5 msec, units: microseconds */ -#ifdef CONFIG_ZEN_INTERACTIVE -unsigned int sysctl_sched_cfs_bandwidth_slice = 3000UL; -#else unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; #endif -#endif /* * The margin used when comparing utilization with CPU capacity: @@ -3612,7 +3586,6 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { - /* * Update run-time statistics of the 'current'. 
*/ @@ -4690,7 +4663,6 @@ static inline void hrtick_update(struct rq *rq) #endif #ifdef CONFIG_SMP - static bool __cpu_overutilized(int cpu, int delta); static bool cpu_overutilized(int cpu); unsigned long boosted_cpu_util(int cpu); @@ -4710,7 +4682,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct sched_entity *se = &p->se; #ifdef CONFIG_SMP int task_new = flags & ENQUEUE_WAKEUP_NEW; - #endif /* @@ -4760,6 +4731,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) add_nr_running(rq, 1); #ifdef CONFIG_SMP + /* * Update SchedTune accounting. * @@ -4839,7 +4811,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) #endif for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running--; walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p); @@ -4855,6 +4826,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) sub_nr_running(rq, 1); #ifdef CONFIG_SMP + /* * Update SchedTune accounting * @@ -5269,18 +5241,15 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) /* * Returns the current capacity of cpu after applying both - * cpu and min freq scaling. + * cpu and freq scaling. */ -unsigned long capacity_min_of(int cpu) +unsigned long capacity_curr_of(int cpu) { - if (!sched_feat(MIN_CAPACITY_CAPPING)) - return 0; - return arch_scale_cpu_capacity(NULL, cpu) * - arch_scale_min_freq_capacity(NULL, cpu) + return cpu_rq(cpu)->cpu_capacity_orig * + arch_scale_freq_capacity(NULL, cpu) >> SCHED_CAPACITY_SHIFT; } - static inline bool energy_aware(void) { return sched_feat(ENERGY_AWARE); @@ -5353,7 +5322,6 @@ static unsigned long group_max_util(struct energy_env *eenv) util += eenv->util_delta; max_util = max(max_util, util); - } return max_util; @@ -5519,6 +5487,7 @@ static int sched_group_energy(struct energy_env *eenv) * when we took visit_cpus. */ sd = rcu_dereference(per_cpu(sd_scs, cpu)); + if (sd && sd->parent) sg_shared_cap = sd->parent->groups; @@ -5650,8 +5619,8 @@ static inline int __energy_diff(struct energy_env *eenv) return 0; /* Error */ sg = sd->groups; - do { + do { if (cpu_in_sg(sg, eenv->src_cpu) || cpu_in_sg(sg, eenv->dst_cpu)) { eenv_before.sg_top = eenv->sg_top = sg; @@ -5680,10 +5649,10 @@ static inline int __energy_diff(struct energy_env *eenv) eenv->cap.before, eenv->cap.after, eenv->cap.delta, eenv->nrg.delta, eenv->payoff); #endif - /* * Dead-zone margin preventing too many migrations. 
*/ + margin = eenv->nrg.before >> 6; /* ~1.56% */ diff = eenv->nrg.after - eenv->nrg.before; @@ -5956,7 +5925,6 @@ schedtune_margin(unsigned long signal, long boost) if (boost < 0) margin *= -1; - return margin; } @@ -6102,7 +6070,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, most_spare_sg = group; } } - } while (group = group->next, group != sd->groups); /* @@ -6171,6 +6138,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this */ latest_idle_timestamp = rq->idle_stamp; shallowest_idle_cpu = i; + } } else if (shallowest_idle_cpu == -1) { load = weighted_cpuload(i); if (load < min_load || (load == min_load && i == this_cpu)) { @@ -6284,7 +6252,6 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) sg = sd->groups; do { int i; - if (!cpumask_intersects(sched_group_cpus(sg), tsk_cpus_allowed(p))) goto next; @@ -6422,47 +6389,28 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu, for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) { unsigned long capacity_curr = capacity_curr_of(i); unsigned long capacity_orig = capacity_orig_of(i); - unsigned long wake_util, new_util, min_capped_util; + unsigned long wake_util, new_util; if (!cpu_online(i)) continue; + if (walt_cpu_high_irqload(i)) + continue; + /* * p's blocked utilization is still accounted for on prev_cpu * so prev_cpu will receive a negative bias due to the double * accounting. However, the blocked utilization may be zero. */ wake_util = cpu_util_wake(i, p); - - /* - * Keep track of overall system utilization. - * System is considered in low-utilization if the - * utilization of each (online) CPU is below a - */ - if (wake_util + task_util_boosted >= high_cpu_util) - low_util_mode = false; - - /* Skip high IRQ loaded CPUs */ - if (walt_cpu_high_irqload(i)) - continue; + new_util = wake_util + task_util(p); /* * Ensure minimum capacity to grant the required boost. * The target CPU can be already at a capacity level higher * than the one required to boost the task. */ - new_util = wake_util + task_util(p); - new_util = max(task_util_boosted, new_util); - - /* - * Include minimum capacity constraint: - * new_util contains the required utilization including - * boost. min_capped_util also takes into account a - * minimum capacity cap imposed on the CPU by external - * actors. - */ - min_capped_util = max(new_util, capacity_min_of(i)); - + new_util = max(min_util, new_util); if (new_util > capacity_orig) continue; @@ -6495,57 +6443,23 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu, * tasks. */ if (prefer_idle) { + /* * Case A.1: IDLE CPU - * - * This heuristics will return: - * - the first IDLE CPU we find, if the system - * is !low_util, i.e. there is one CPU whith - * at least high_cpu_util utilization. - * - the most energy_efficient IDLE CPU, if the - * system is low_util, i.e. all the CPUs - * have less then high_cpu_util utilization. + * Return the first IDLE CPU we find. 
*/ if (idle_cpu(i)) { + schedstat_inc(p, se.statistics.nr_wakeups_fbt_pref_idle); + schedstat_inc(this_rq(), eas_stats.fbt_pref_idle); - /* - * Keep track of first IDLE CPU and - * return that one if the system - * is not int low_util mode - */ - if (target_cpu == -1) { - target_capacity = capacity_orig; - target_cpu = i; - if (!low_util_mode) - break; - continue; - } + trace_sched_find_best_target(p, + prefer_idle, min_util, + cpu, best_idle_cpu, + best_active_cpu, i); - /* - * Unconditionally favour first IDLE - * CPU encountered on highly utilized - * systems. - */ - if (!low_util_mode) - break; - - /* Favor CPUs with smaller capacity */ - if (capacity_orig >= target_capacity) - continue; - - target_capacity = capacity_orig; - low_util_cpu = i; - continue; + return i; } - /* - * Restrict search on idle CPUs if we already - * found at least one. - */ - if (target_cpu >= 0) { - best_active_cpu = -1; - continue; - } /* * Case A.2: Target ACTIVE CPU * Favor CPUs with max spare capacity. @@ -6574,7 +6488,7 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu, best_active_util = new_util; best_active_cpu = i; continue; - } + } /* * Enforce EAS mode @@ -6589,13 +6503,6 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu, (capacity_orig * SCHED_CAPACITY_SCALE)) continue; - /* - * Favor CPUs with smaller capacity for Non latency - * sensitive tasks. - */ - if (capacity_orig > target_capacity) - continue; - /* * Case B) Non latency sensitive tasks on IDLE CPUs. * @@ -6623,11 +6530,8 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu, if (idle_cpu(i)) { int idle_idx = idle_get_state_idx(cpu_rq(i)); - /* Favor CPUs that won't end up running at a - * high OPP. - */ - if ((capacity_orig - min_capped_util) < - target_idle_max_spare_cap) + /* Select idle CPU with lower cap_orig */ + if (capacity_orig > best_idle_min_cap_orig) continue; /* @@ -6641,9 +6545,7 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu, continue; /* Keep track of best idle CPU */ - target_capacity = capacity_orig; - target_idle_max_spare_cap = capacity_orig - - min_capped_util; + best_idle_min_cap_orig = capacity_orig; best_idle_cstate = idle_idx; best_idle_cpu = i; continue; @@ -6665,72 +6567,55 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu, * that CPU at an higher OPP. * * Thus, this case keep track of the CPU with the - * smallest maximum capacity, highest spare maximum - * capacity and highest free cpu utility. + * smallest maximum capacity and highest spare maximum + * capacity. */ - /* Favor CPUs with maximum spare capacity */ - if ((capacity_orig - min_capped_util) < - target_max_spare_cap) + /* Favor CPUs with smaller capacity */ + if (capacity_orig > target_capacity) continue; - /* Favor CPUs with maximum free utilization */ - if ((capacity_orig - cpu_util(i)) < target_max_free_util) + /* Favor CPUs with maximum spare capacity */ + if ((capacity_orig - new_util) < target_max_spare_cap) continue; - target_max_spare_cap = capacity_orig - min_capped_util; + target_max_spare_cap = capacity_orig - new_util; target_capacity = capacity_orig; - target_max_free_util = capacity_orig - cpu_util(i); target_cpu = i; - } + } } while (sg = sg->next, sg != sd->groups); /* - * For latency sensitive tasks, case A in the previous loop, we pick - * the best ACTIVE CPU only if we was not able to find a target IDLE - * CPU. - * The target IDLE CPU is selected depending on CPUs utilization. 
- * In !low_util_mode we always pick the first IDLE candidate - * encountered. Otherwise, for low utilized systems, the most energy - * efficient IDLE CPU is preferred. - */ - if (prefer_idle && !low_util_mode) { - if (target_cpu == -1) - target_cpu = best_active_cpu; - schedstat_inc(p, se.statistics.nr_wakeups_fbt_pref_idle); - schedstat_inc(this_rq(), eas_stats.fbt_pref_idle); - goto done; - } - - if (prefer_idle && low_util_mode) { - if (low_util_cpu != -1) - target_cpu = low_util_cpu; - schedstat_inc(p, se.statistics.nr_wakeups_fbt_pref_idle_lum); - schedstat_inc(this_rq(), eas_stats.fbt_pref_idle_lum); - goto done; - } - - /* - * For non latency sensitive tasks, cases B and C in the previous - * loop, we pick the best IDLE CPU only if we was not able to find a - * target ACTIVE CPU. - * Otherwise, the best IDLE CPU becomes our backup choice. + * For non latency sensitive tasks, cases B and C in the previous loop, + * we pick the best IDLE CPU only if we were not able to find a target + * ACTIVE CPU. + * + * Policy priorities: + * + * - prefer_idle tasks: + * + * a) IDLE CPU available, we return immediately + * b) ACTIVE CPU where task fits and has the bigger maximum spare + * capacity (i.e. target_cpu) + * c) ACTIVE CPU with less contention due to other tasks + * (i.e. best_active_cpu) + * + * - NON prefer_idle tasks: + * + * a) ACTIVE CPU: target_cpu + * b) IDLE CPU: best_idle_cpu */ - if (target_cpu == -1) { - target_cpu = best_idle_cpu; - schedstat_inc(p, se.statistics.nr_wakeups_fbt_best_idle); - schedstat_inc(this_rq(), eas_stats.fbt_best_idle); - goto done; - } - *backup_cpu = best_idle_cpu; - schedstat_inc(p, se.statistics.nr_wakeups_fbt_best_active); - schedstat_inc(this_rq(), eas_stats.fbt_best_active); - -done: + if (target_cpu == -1) + target_cpu = prefer_idle + ? best_active_cpu + : best_idle_cpu; + else + *backup_cpu = prefer_idle + ?
best_active_cpu + : best_idle_cpu; - trace_sched_find_best_target(p, prefer_idle, task_util_boosted, cpu, - low_util_mode, low_util_cpu, + trace_sched_find_best_target(p, prefer_idle, min_util, cpu, best_idle_cpu, best_active_cpu, target_cpu); @@ -6750,34 +6635,25 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu, static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) { long min_cap, max_cap; -#ifdef CONFIG_CGROUP_SCHEDTUNE - bool boosted = schedtune_task_boost(p) > 0; - bool prefer_idle = schedtune_prefer_idle(p) > 0; -#else - bool boosted = 0; - bool prefer_idle = 0; -#endif min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu)); max_cap = cpu_rq(cpu)->rd->max_cpu_capacity.val; - /* Bring task utilization in sync with prev_cpu */ - if (!boosted && !prefer_idle) - sync_entity_load_avg(&p->se); - /* Minimum capacity is close to max, no need to abort wake_affine */ if (max_cap - min_cap < max_cap >> 3) return 0; + /* Bring task utilization in sync with prev_cpu */ + sync_entity_load_avg(&p->se); + return min_cap * 1024 < task_util(p) * capacity_margin; } static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync) { + struct sched_domain *sd; + int target_cpu = prev_cpu, tmp_target, tmp_backup; bool boosted, prefer_idle; - int target_cpu; - int backup_cpu; - int next_cpu; schedstat_inc(p, se.statistics.nr_wakeups_secb_attempts); schedstat_inc(this_rq(), eas_stats.secb_attempts); @@ -6792,6 +6668,7 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync } } + rcu_read_lock(); #ifdef CONFIG_CGROUP_SCHEDTUNE boosted = schedtune_task_boost(p) > 0; prefer_idle = schedtune_prefer_idle(p) > 0; @@ -6802,41 +6679,32 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync sync_entity_load_avg(&p->se); + sd = rcu_dereference(per_cpu(sd_ea, prev_cpu)); /* Find a cpu with sufficient capacity */ - next_cpu = find_best_target(p, &backup_cpu, prefer_idle); - if (next_cpu == -1) { - target_cpu = prev_cpu; - return target_cpu; - } + tmp_target = find_best_target(p, &tmp_backup, boosted, prefer_idle); - /* Unconditionally prefer IDLE CPUs for boosted/prefer_idle tasks */ - if ((boosted || prefer_idle) && idle_cpu(next_cpu)) { - schedstat_inc(p, se.statistics.nr_wakeups_secb_idle_bt); - schedstat_inc(this_rq(), eas_stats.secb_idle_bt); - target_cpu = next_cpu; - return target_cpu; + if (!sd) + goto unlock; + if (tmp_target >= 0) { + target_cpu = tmp_target; + if ((boosted || prefer_idle) && idle_cpu(target_cpu)) { + schedstat_inc(p, se.statistics.nr_wakeups_secb_idle_bt); + schedstat_inc(this_rq(), eas_stats.secb_idle_bt); + goto unlock; + } } - target_cpu = prev_cpu; - if (next_cpu != prev_cpu) { + if (target_cpu != prev_cpu) { int delta = 0; struct energy_env eenv = { - .p = p, .util_delta = task_util(p), - /* Task's previous CPU candidate */ - .cpu[EAS_CPU_PRV] = { - .cpu_id = prev_cpu, - }, - /* Main alternative CPU candidate */ - .cpu[EAS_CPU_NXT] = { - .cpu_id = next_cpu, - }, - /* Backup alternative CPU candidate */ - .cpu[EAS_CPU_BKP] = { - .cpu_id = backup_cpu, - }, + .src_cpu = prev_cpu, + .dst_cpu = target_cpu, + .task = p, + .trg_cpu = target_cpu, }; + #ifdef CONFIG_SCHED_WALT if (!walt_disabled && sysctl_sched_use_walt_cpu_util && p->state == TASK_WAKING) @@ -6846,27 +6714,35 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync if (__cpu_overutilized(prev_cpu, delta)) { schedstat_inc(p, se.statistics.nr_wakeups_secb_insuff_cap); 
schedstat_inc(this_rq(), eas_stats.secb_insuff_cap); - target_cpu = next_cpu; - return target_cpu; + goto unlock; } - /* Check if EAS_CPU_NXT is a more energy efficient CPU */ - if (select_energy_cpu_idx(&eenv) != EAS_CPU_PRV) { - schedstat_inc(p, se.statistics.nr_wakeups_secb_nrg_sav); - schedstat_inc(this_rq(), eas_stats.secb_nrg_sav); - target_cpu = eenv.cpu[eenv.next_idx].cpu_id; - return target_cpu; + if (energy_diff(&eenv) >= 0) { + /* No energy saving for target_cpu, try backup */ + target_cpu = tmp_backup; + eenv.dst_cpu = target_cpu; + eenv.trg_cpu = target_cpu; + if (tmp_backup < 0 || + tmp_backup == prev_cpu || + energy_diff(&eenv) >= 0) { + schedstat_inc(p, se.statistics.nr_wakeups_secb_no_nrg_sav); + schedstat_inc(this_rq(), eas_stats.secb_no_nrg_sav); + target_cpu = prev_cpu; + goto unlock; + } } - schedstat_inc(p, se.statistics.nr_wakeups_secb_no_nrg_sav); - schedstat_inc(this_rq(), eas_stats.secb_no_nrg_sav); - target_cpu = prev_cpu; - return target_cpu; + schedstat_inc(p, se.statistics.nr_wakeups_secb_nrg_sav); + schedstat_inc(this_rq(), eas_stats.secb_nrg_sav); + goto unlock; } schedstat_inc(p, se.statistics.nr_wakeups_secb_count); schedstat_inc(this_rq(), eas_stats.secb_count); +unlock: + rcu_read_unlock(); + return target_cpu; } @@ -6892,9 +6768,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int want_affine = 0; int sync = wake_flags & WF_SYNC; - if (p->nr_cpus_allowed == 1) - return prev_cpu; - if (sd_flag & SD_BALANCE_WAKE) { record_wakee(p); want_affine = !wake_wide(p, sibling_count_hint) && @@ -6932,16 +6805,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f new_cpu = cpu; } -#ifdef CONFIG_HISI_EAS_SCHED - if (boot_boost || (global_boost_enabled_flag && (schedtune_task_boost(p) > 0))) { - int boost_cpu = find_global_boost_cpu(p); - if (boost_cpu != -1) { - rcu_read_unlock(); - return boost_cpu; - } - } -#endif - if (sd && !(sd_flag & SD_BALANCE_FORK)) { /* * We're going to need the task's util for capacity_spare_wake @@ -6951,7 +6814,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f sync_entity_load_avg(&p->se); } - if (!sd) { if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); @@ -6959,9 +6821,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f } else { new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag); } - - -unlock: rcu_read_unlock(); return new_cpu; @@ -7219,12 +7078,14 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev) } while (cfs_rq); p = task_of(se); + #ifdef CONFIG_HW_VIP_THREAD /* * * pick vip or temp vip thread */ pick_vip_thread(rq, &p, &se); #endif + /* * Since we haven't yet done put_prev_entity and if the selected task * is a different task than we started out with, try and touch the @@ -7635,15 +7496,11 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* * We do not migrate tasks that are: - * 1) energy_aware is enabled and small task is not migrated to higher - * capacity CPU - * 2) throttled_lb_pair, or - * 3) cannot be migrated to this CPU due to cpus_allowed, or - * 4) running (obviously), or - * 5) are cache-hot on their current CPU. + * 1) throttled_lb_pair, or + * 2) cannot be migrated to this CPU due to cpus_allowed, or + * 3) running (obviously), or + * 4) are cache-hot on their current CPU. 
*/ - - if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) return 0; @@ -7754,55 +7611,6 @@ static struct task_struct *detach_one_task(struct lb_env *env) static const unsigned int sched_nr_migrate_break = 32; -/* must hold runqueue lock for queue se is currently on */ -static struct task_struct *hisi_get_heaviest_task( - struct task_struct *p, int cpu) -{ - int num_tasks = 5; - struct sched_entity *se = &p->se; - unsigned long int max_util = task_util(p), max_preferred_util= 0, util; - struct task_struct *tsk, *max_preferred_tsk = NULL, *max_util_task = p; - - /* The currently running task is not on the runqueue */ - se = __pick_first_entity(cfs_rq_of(se)); - - while (num_tasks && se) { - if (!entity_is_task(se)) { - se = __pick_next_entity(se); - num_tasks--; - continue; - } - - tsk = task_of(se); - util = boosted_task_util(tsk); -#ifdef CONFIG_CGROUP_SCHEDTUNE - bool boosted = schedtune_task_boost(tsk) > 0; - bool prefer_idle = schedtune_prefer_idle(tsk) > 0; -#else - bool boosted = 0; - bool prefer_idle = 0; -#endif - - if (cpumask_test_cpu(cpu, tsk_cpus_allowed(tsk))) { - if (boosted || prefer_idle) { - if (util > max_preferred_util) { - max_preferred_util = util;; - max_preferred_tsk = tsk; - } - } else { - if (util > max_util) { - max_util = util; - max_util_task = tsk; - } - } - } - - se = __pick_next_entity(se); - num_tasks--; - } - - return max_preferred_tsk ? max_preferred_tsk : max_util_task; -} /* * detach_tasks() -- tries to detach up to imbalance weighted load from * busiest_rq, as part of a balancing operation within domain "sd". @@ -7843,25 +7651,6 @@ static int detach_tasks(struct lb_env *env) break; } -#ifdef CONFIG_HISI_EAS_SCHED - if (energy_aware() && - (capacity_orig_of(env->dst_cpu) > capacity_orig_of(env->src_cpu))) { - p = hisi_get_heaviest_task(p, env->dst_cpu); - -#ifdef CONFIG_CGROUP_SCHEDTUNE - bool boosted = schedtune_task_boost(p) > 0; - bool prefer_idle = schedtune_prefer_idle(p) > 0; -#else - bool boosted = 0; - bool prefer_idle = 0; -#endif - if (!boosted && !prefer_idle && - task_util(p) * 100 < capacity_orig_of(env->src_cpu) * up_migration_util_filter) - goto next; - - } -#endif - if (!can_migrate_task(p, env)) goto next; @@ -8084,7 +7873,6 @@ struct sd_lb_stats { struct sched_group *local; /* Local group in this sd */ unsigned long total_load; /* Total load of all groups in sd */ unsigned long total_capacity; /* Total capacity of all groups in sd */ - unsigned long total_util; /* Total util of all groups in sd */ unsigned long avg_load; /* Average load across all groups in sd */ struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */ @@ -8104,7 +7892,6 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) .local = NULL, .total_load = 0UL, .total_capacity = 0UL, - .total_util = 0UL, .busiest_stat = { .avg_load = 0UL, .sum_nr_running = 0, @@ -8194,9 +7981,6 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) cpu_rq(cpu)->cpu_capacity_orig = capacity; - capacity *= arch_scale_max_freq_capacity(sd, cpu); - capacity >>= SCHED_CAPACITY_SHIFT; - mcc = &cpu_rq(cpu)->rd->max_cpu_capacity; raw_spin_lock_irqsave(&mcc->lock, flags); @@ -8402,7 +8186,8 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) static inline bool group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref) { - return sg->sgc->max_capacity < ref->sgc->max_capacity; + return sg->sgc->max_capacity + capacity_margin - SCHED_LOAD_SCALE < + ref->sgc->max_capacity; } static inline enum @@ -8466,7 +8251,7 @@ 
static inline void update_cpu_stats_if_tickless(struct rq *rq) { } static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, int local_group, struct sg_lb_stats *sgs, - bool *overload, bool *overutilized, bool *misfit_task) + bool *overload, bool *overutilized) { unsigned long load; int i, nr_running; @@ -8504,23 +8289,11 @@ static inline void update_sg_lb_stats(struct lb_env *env, /* * No need to call idle_cpu() if nr_running is not 0 */ - if (!nr_running && idle_cpu(i)) { + if (!nr_running && idle_cpu(i)) sgs->idle_cpus++; - /* update idle CPU blocked load */ - if (cpu_util(i)) - update_blocked_averages(i); - } - - if (cpu_overutilized(i) && !idle_cpu(i)) { + if (cpu_overutilized(i)) { *overutilized = true; - /* - * If the cpu is overutilized and if there is only one - * current task in cfs runqueue, it is potentially a misfit - * task. - */ - if (rq->cfs.h_nr_running == 1) - *misfit_task = true; if (!sgs->group_misfit_task && rq->misfit_task) sgs->group_misfit_task = capacity_of(i); } @@ -8650,11 +8423,11 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq) */ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds) { - struct sched_domain *child = env->sd->child, *sd; + struct sched_domain *child = env->sd->child; struct sched_group *sg = env->sd->groups; struct sg_lb_stats tmp_sgs; int load_idx, prefer_sibling = 0; - bool overload = false, overutilized = false, misfit_task = false; + bool overload = false, overutilized = false; if (child && child->flags & SD_PREFER_SIBLING) prefer_sibling = 1; @@ -8676,8 +8449,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd } update_sg_lb_stats(env, sg, load_idx, local_group, sgs, - &overload, &overutilized, - &misfit_task); + &overload, &overutilized); if (local_group) goto next_group; @@ -8717,7 +8489,6 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd /* Now, start updating sd_lb_stats */ sds->total_load += sgs->group_load; sds->total_capacity += sgs->group_capacity; - sds->total_util += sgs->group_util; sg = sg->next; } while (sg != env->sd->groups); @@ -8731,48 +8502,19 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd /* update overload indicator if we are at root domain */ if (env->dst_rq->rd->overload != overload) env->dst_rq->rd->overload = overload; - } - if (overutilized) - set_sd_overutilized(env->sd); - else - clear_sd_overutilized(env->sd); - /* - * If there is a misfit task in one cpu in this sched_domain - * it is likely that the imbalance cannot be sorted out among - * the cpu's in this sched_domain. In this case set the - * overutilized flag at the parent sched_domain. - */ - if (misfit_task) { - - sd = env->sd->parent; - - /* - * In case of a misfit task, load balance at the parent - * sched domain level will make sense only if the the cpus - * have a different capacity. 
If cpus at a domain level have - * the same capacity, the misfit task cannot be well - * accomodated in any of the cpus and there in no point in - * trying a load balance at this level - */ - while (sd) { - if (sd->flags & SD_ASYM_CPUCAPACITY) { - set_sd_overutilized(sd); - break; - } - sd = sd->parent; + /* Update over-utilization (tipping point, U >= 0) indicator */ + if (env->dst_rq->rd->overutilized != overutilized) { + env->dst_rq->rd->overutilized = overutilized; + trace_sched_overutilized(overutilized); + } + } else { + if (!env->dst_rq->rd->overutilized && overutilized) { + env->dst_rq->rd->overutilized = true; + trace_sched_overutilized(true); } } - /* If the domain util is greater that domain capacity, load balancing - * needs to be done at the next sched domain level as well - */ -#ifdef CONFIG_HISI_EAS_SCHED - if (sds->total_capacity * 1024 < sds->total_util * sd_capacity_margin) -#else - if (sds->total_capacity * 1024 < sds->total_util * capacity_margin) -#endif - set_sd_overutilized(env->sd->parent); } /** @@ -8837,12 +8579,6 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) local = &sds->local_stat; busiest = &sds->busiest_stat; - if (busiest->sum_nr_running >= busiest->group_weight && - local->sum_nr_running < local->group_weight) { - env->imbalance = busiest->load_per_task; - return; - } - if (!local->sum_nr_running) local->load_per_task = cpu_avg_load_per_task(env->dst_cpu); else if (busiest->load_per_task > local->load_per_task) @@ -9021,10 +8757,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env) */ update_sd_lb_stats(env, &sds); - if (energy_aware() && ((env->sd->flags & SD_ASYM_CPUCAPACITY) || env->idle == CPU_NOT_IDLE)) { - if (!is_sd_overutilized(env->sd)) - goto out_balanced; - } + if (energy_aware() && !env->dst_rq->rd->overutilized) + goto out_balanced; local = &sds.local_stat; busiest = &sds.busiest_stat; @@ -9084,22 +8818,10 @@ static struct sched_group *find_busiest_group(struct lb_env *env) * significant if the diff is greater than 1 otherwise we * might end up to just move the imbalance on another group */ -#ifdef CONFIG_HISI_EAS_SCHED if ((busiest->group_type != group_overloaded) && (local->idle_cpus <= (busiest->idle_cpus + 1)) && !group_smaller_cpu_capacity(sds.busiest, sds.local)) goto out_balanced; -#else - /* busiest->group_type is group_other, if the CPU is in the same frequency doamin - * then the load balance will be aborted. - * - * CPU 4 have a lot of threads but the CPU util is low, the group_type will be assiged - * to group_other. NOHZ idle balance will be needed to spread tasks out. 
- */ - if ((local->idle_cpus <= (busiest->idle_cpus + 1)) && - busiest->sum_nr_running <= busiest->group_weight) - goto out_balanced; -#endif } else { /* * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use @@ -9232,14 +8954,11 @@ static int need_active_balance(struct lb_env *env) return 1; } - if ((capacity_orig_of(env->src_cpu) < capacity_orig_of(env->dst_cpu)) && - env->src_rq->cfs.h_nr_running == 1 && - cpu_overutilized(env->src_cpu)) { - - if (idle_cpu(env->dst_cpu)) - return 1; - - if (!idle_cpu(env->dst_cpu) && !cpu_overutilized(env->dst_cpu)) + if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) && + ((capacity_orig_of(env->src_cpu) < capacity_orig_of(env->dst_cpu))) && + env->src_rq->cfs.h_nr_running == 1 && + cpu_overutilized(env->src_cpu) && + !cpu_overutilized(env->dst_cpu)) { return 1; } @@ -9749,9 +9468,8 @@ static int active_load_balance_cpu_stop(void *data) update_rq_clock(busiest_rq); p = detach_one_task(&env); - if (p) { + if (p) schedstat_inc(sd, alb_pushed); - } else schedstat_inc(sd, alb_failed); } @@ -9790,109 +9508,9 @@ static inline int on_null_domain(struct rq *rq) * needed, they will kick the idle load balancer, which then does idle * load balancing for all the idle CPUs. */ - -#ifdef CONFIG_HISI_EAS_SCHED -/* - * Reset balance_interval at all sched_domain levels of given cpu, so that it - * honors kick. - */ -static inline void reset_balance_interval(int cpu) -{ - struct sched_domain *sd; - - if (cpu >= nr_cpu_ids) - return; - - rcu_read_lock(); - for_each_domain(cpu, sd) - sd->balance_interval = 0; - rcu_read_unlock(); -} - -#define NOHZ_KICK_ANY 0 -#define NOHZ_KICK_RESTRICT 1 -#define NOHZ_KICK_BOOST 2 - -static inline int nohz_kick_type(int call_cpu, struct sched_domain *sd) -{ - int type = NOHZ_KICK_ANY; - int i; - - if (hisi_test_fast_cpu(call_cpu)) - return NOHZ_KICK_ANY; - - if (energy_aware() && cpu_rq(call_cpu)->misfit_task) { - type = NOHZ_KICK_ANY; - } else if (!is_sd_overutilized(sd) && !cpu_overutilized(call_cpu)) { - type = NOHZ_KICK_RESTRICT; - } else { - for_each_cpu(i, sched_domain_span(sd)) { - - if (cpu_util(i) * sd_capacity_margin < capacity_orig_of(i) * 1024) { - /* Change the kick type to limit to CPUs that - * are of equal or lower capacity. 
- */ - type = NOHZ_KICK_RESTRICT; - break; - } - } - } - - return type; -} - -static inline int hisi_find_new_ilb(void) -{ - struct sched_domain *sd; - int call_cpu = smp_processor_id(); - int type = NOHZ_KICK_ANY; - int ilb = nr_cpu_ids; - bool ilb_found = false; - - rcu_read_lock(); - - sd = rcu_dereference_check_sched_domain(cpu_rq(call_cpu)->sd); - if (!sd) { - rcu_read_unlock(); - return nr_cpu_ids; - } - - type = nohz_kick_type(call_cpu, sd); - - for_each_domain(call_cpu, sd) { - for_each_cpu_and(ilb, nohz.idle_cpus_mask, sched_domain_span(sd)) { - if (idle_cpu(ilb)) { - bool is_bigger_cpu = capacity_orig_of(ilb) > capacity_orig_of(call_cpu); - - if ((type == NOHZ_KICK_ANY) || - (type == NOHZ_KICK_BOOST && is_bigger_cpu) || - (type == NOHZ_KICK_RESTRICT && !is_bigger_cpu)) { - ilb_found = true; - break; - } - - } - } - - if (ilb_found) - break; - } - - rcu_read_unlock(); - - reset_balance_interval(ilb); - - return ilb; -} -#endif - static inline int find_new_ilb(void) { -#ifdef CONFIG_HISI_EAS_SCHED - int ilb = hisi_find_new_ilb(); -#else int ilb = cpumask_first(nohz.idle_cpus_mask); -#endif if (ilb < nr_cpu_ids && idle_cpu(ilb)) return ilb; @@ -10048,11 +9666,6 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) rcu_read_lock(); for_each_domain(cpu, sd) { - if (energy_aware() && ((sd->flags & SD_ASYM_CPUCAPACITY) || idle == CPU_NOT_IDLE)) { - if (!is_sd_overutilized(sd)) - continue; - } - /* * Decay the newidle max times here because this is a regular * visit to all the domains. Decay ~1% per second. @@ -10237,20 +9850,11 @@ static inline bool nohz_kick_needed(struct rq *rq) if (likely(!atomic_read(&nohz.nr_cpus))) return false; -#ifdef CONFIG_HISI_EAS_SCHED - if (rq->nr_running >= 2) - return true; -#endif - if (time_before(now, nohz.next_balance)) return false; if (rq->nr_running >= 2 && - (!energy_aware() || cpu_overutilized(cpu))) - return true; - - /* Do idle load balance if there have misfit task */ - if (energy_aware() && rq->misfit_task) + (!energy_aware() || cpu_overutilized(cpu))) return true; /* Do idle load balance if there have misfit task */ @@ -10398,7 +10002,6 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) { struct cfs_rq *cfs_rq; struct sched_entity *se = &curr->se; - struct sched_domain *sd; for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); @@ -10409,12 +10012,10 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) task_tick_numa(rq, curr); #ifdef CONFIG_SMP - rcu_read_lock(); - sd = rcu_dereference(rq->sd); - if (!is_sd_overutilized(sd) && - cpu_overutilized(task_cpu(curr))) - set_sd_overutilized(sd); - rcu_read_unlock(); + if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) { + rq->rd->overutilized = true; + trace_sched_overutilized(true); + } rq->misfit_task = !task_fits_max(curr, rq->cpu); #endif @@ -10953,154 +10554,3 @@ __init void init_sched_fair_class(void) #endif /* SMP */ } - -#ifdef CONFIG_HISI_EAS_SCHED -#define EAS_DATA_SYSFS_MAX 5 - -struct eas_global_attr { - struct attribute attr; - ssize_t (*show)(struct kobject *kobj, - struct attribute *attr, char *buf); - ssize_t (*store)(struct kobject *a, struct attribute *b, - const char *c, size_t count); - int *value; - int (*to_sysfs)(int); - int (*from_sysfs)(int); - ssize_t (*to_sysfs_text)(char *buf, int buf_size); -}; - -struct eas_data_struct { - int multiplier; /* used to scale the time delta */ - struct attribute_group attr_group; - struct attribute *attributes[EAS_DATA_SYSFS_MAX + 1]; - struct 
eas_global_attr attr[EAS_DATA_SYSFS_MAX]; -} eas_data; - -static ssize_t eas_show(struct kobject *kobj, - struct attribute *attr, char *buf) -{ - struct eas_global_attr *eas_attr = - container_of(attr, struct eas_global_attr, attr); - int temp; - - if (eas_attr->to_sysfs_text != NULL) - return eas_attr->to_sysfs_text(buf, PAGE_SIZE); - - temp = *(eas_attr->value); - if (eas_attr->to_sysfs != NULL) - temp = eas_attr->to_sysfs(temp); - - return (ssize_t)sprintf(buf, "%d\n", temp); -} - -static ssize_t eas_store(struct kobject *a, struct attribute *attr, - const char *buf, size_t count) -{ - int temp; - ssize_t ret = count; - struct eas_global_attr *eas_attr = - container_of(attr, struct eas_global_attr, attr); - char *str = vmalloc(count + 1); - - if (str == NULL) - return -ENOMEM; - - memcpy(str, buf, count); - str[count] = 0; - if (sscanf(str, "%d", &temp) < 1) - ret = -EINVAL; - else { - if (eas_attr->from_sysfs != NULL) - temp = eas_attr->from_sysfs(temp); - if (temp < 0) - ret = -EINVAL; - else - *(eas_attr->value) = temp; - } - - /* trace the name and value of the attribute */ - trace_eas_attr_store(attr->name, temp); - vfree(str); - return ret; -} - -static void eas_attr_add( - const char *name, - int *value, - int (*to_sysfs)(int), - int (*from_sysfs)(int), - ssize_t (*to_sysfs_text)(char *, int), - umode_t mode) -{ - int i = 0; - - while (eas_data.attributes[i] != NULL) { - i++; - if (i >= EAS_DATA_SYSFS_MAX) - return; - } - if (mode) - eas_data.attr[i].attr.mode = mode; - else - eas_data.attr[i].attr.mode = 0644; - eas_data.attr[i].show = eas_show; - eas_data.attr[i].store = eas_store; - eas_data.attr[i].attr.name = name; - eas_data.attr[i].value = value; - eas_data.attr[i].to_sysfs = to_sysfs; - eas_data.attr[i].from_sysfs = from_sysfs; - eas_data.attr[i].to_sysfs_text = to_sysfs_text; - eas_data.attributes[i] = &eas_data.attr[i].attr; - eas_data.attributes[i + 1] = NULL; -} - -static int eas_attr_init(void) -{ - int ret; - - memset(&eas_data, 0, sizeof(eas_data)); - - eas_attr_add("boost", - &global_boost_enabled_flag, - NULL, - NULL, - NULL, - 0644); - - eas_attr_add("up_migration_util_filter", - &up_migration_util_filter, - NULL, - NULL, - NULL, - 0644); - - eas_attr_add("sd_capacity_margin", - &sd_capacity_margin, - NULL, - NULL, - NULL, - 0644); - - eas_attr_add("capacity_margin", - &capacity_margin, - NULL, - NULL, - NULL, - 0644); - - eas_attr_add("boot_boost", - &boot_boost, - NULL, - NULL, - NULL, - 0644); - - eas_data.attr_group.name = "eas"; - eas_data.attr_group.attrs = eas_data.attributes; - ret = sysfs_create_group(kernel_kobj, - &eas_data.attr_group); - - return 0; -} -late_initcall(eas_attr_init); -#endif From 1ef6bf68a3d990791d8319eee0a7d69b5f0856c0 Mon Sep 17 00:00:00 2001 From: UchihaDev Date: Wed, 2 May 2018 18:58:35 +0545 Subject: [PATCH 135/146] sched/features: rebase on aosp --- kernel/sched/features.h | 29 ++++------------------------- 1 file changed, 4 insertions(+), 25 deletions(-) diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 788c0b9ad..03863fe67 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -73,30 +73,9 @@ SCHED_FEAT(ATTACH_AGE_LOAD, true) * Energy aware scheduling. Use platform energy model to guide scheduling * decisions optimizing for energy efficiency. */ +#ifdef CONFIG_DEFAULT_USE_ENERGY_AWARE SCHED_FEAT(ENERGY_AWARE, true) -/* - * Minimum capacity capping. Keep track of minimum capacity factor when - * minimum frequency available to a policy is modified. 
- * If enabled, this can be used to inform the scheduler about capacity - * restrictions. - */ -SCHED_FEAT(MIN_CAPACITY_CAPPING, true) - -/* - * Enforce the priority of candidates selected by find_best_target() - * ON: If the target CPU saves any energy, use that. - * OFF: Use whichever of target or backup saves most. - */ -SCHED_FEAT(FBT_STRICT_ORDER, false) - -/* - * Apply schedtune boost hold to tasks of all sched classes. - * If enabled, schedtune will hold the boost applied to a CPU - * for 50ms regardless of task activation - if the task is - * still running 50ms later, the boost hold expires and schedtune - * boost will expire immediately the task stops. - * If disabled, this behaviour will only apply to tasks of the - * RT class. - */ -SCHED_FEAT(SCHEDTUNE_BOOST_HOLD_ALL, true) +#else +SCHED_FEAT(ENERGY_AWARE, false) +#endif From 3a25874b3f99f294513ef06d4e9372baaa8834cc Mon Sep 17 00:00:00 2001 From: UchihaDev Date: Wed, 2 May 2018 19:00:41 +0545 Subject: [PATCH 136/146] sched/loadavg: rebase on aosp --- kernel/sched/loadavg.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index 8ec471cb8..dfeedd9fa 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -168,7 +168,7 @@ static inline int calc_load_write_idx(void) * If the folding window started, make sure we start writing in the * next idle-delta. */ - if (!time_before(jiffies, READ_ONCE(calc_load_update))) + if (!time_before(jiffies, calc_load_update)) idx++; return idx & 1; @@ -203,7 +203,7 @@ void calc_load_exit_idle(void) /* * If we're still before the pending sample window, we're done. */ - this_rq->calc_load_update = READ_ONCE(calc_load_update); + this_rq->calc_load_update = calc_load_update; if (time_before(jiffies, this_rq->calc_load_update)) return; @@ -307,15 +307,14 @@ calc_load_n(unsigned long load, unsigned long exp, */ static void calc_global_nohz(void) { - unsigned long sample_window; long delta, active, n; - sample_window = READ_ONCE(calc_load_update); - if (!time_before(jiffies, sample_window + 10)) { + if (!time_before(jiffies, calc_load_update + 10)) { + /* * Catch-up, fold however many we are behind still */ - delta = jiffies - sample_window - 10; + delta = jiffies - calc_load_update - 10; n = 1 + (delta / LOAD_FREQ); active = atomic_long_read(&calc_load_tasks); @@ -325,7 +324,7 @@ static void calc_global_nohz(void) avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); - WRITE_ONCE(calc_load_update, sample_window + n * LOAD_FREQ); + calc_load_update += n * LOAD_FREQ; } /* @@ -353,11 +352,9 @@ static inline void calc_global_nohz(void) { } */ void calc_global_load(unsigned long ticks) { - unsigned long sample_window; long active, delta; - sample_window = READ_ONCE(calc_load_update); - if (time_before(jiffies, sample_window + 10)) + if (time_before(jiffies, calc_load_update + 10)) return; /* @@ -374,7 +371,7 @@ void calc_global_load(unsigned long ticks) avenrun[1] = calc_load(avenrun[1], EXP_5, active); avenrun[2] = calc_load(avenrun[2], EXP_15, active); - WRITE_ONCE(calc_load_update, sample_window + LOAD_FREQ); + calc_load_update += LOAD_FREQ; /* * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. 
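For reference, the avenrun[] folding that calc_global_load() and calc_global_nohz() touch in the hunks above is a fixed-point exponential moving average sampled every LOAD_FREQ (~5s). The sketch below is a minimal user-space rendering of that math, under the assumption that the constants mirror the kernel's FSHIFT/EXP_* values from loadavg; the loop in calc_load_n() stands in for the kernel's closed-form fixed_power_int(), and the numbers in main() are made up purely for illustration.

    /* Stand-alone sketch of the fixed-point decay behind avenrun[]. */
    #include <stdio.h>

    #define FSHIFT   11                 /* bits of fractional precision */
    #define FIXED_1  (1UL << FSHIFT)    /* 1.0 in fixed point */
    #define EXP_1    1884               /* ~ FIXED_1 * exp(-5s/60s)  */
    #define EXP_5    2014               /* ~ FIXED_1 * exp(-5s/300s) */
    #define EXP_15   2037               /* ~ FIXED_1 * exp(-5s/900s) */

    /* One sample window: load = load*exp + active*(1-exp), in fixed point. */
    static unsigned long calc_load(unsigned long load, unsigned long exp,
                                   unsigned long active)
    {
            unsigned long newload;

            newload = load * exp + active * (FIXED_1 - exp);
            if (active >= load)
                    newload += FIXED_1 - 1;   /* round toward the new sample */

            return newload / FIXED_1;
    }

    /* Catch up after idling through n missed sample windows
     * (the kernel computes the same result in closed form). */
    static unsigned long calc_load_n(unsigned long load, unsigned long exp,
                                     unsigned long active, unsigned int n)
    {
            while (n--)
                    load = calc_load(load, exp, active);
            return load;
    }

    int main(void)
    {
            unsigned long avenrun1 = 0;
            unsigned long active = 2 * FIXED_1;   /* pretend 2 runnable tasks */
            unsigned int i;

            for (i = 0; i < 12; i++)              /* one simulated minute */
                    avenrun1 = calc_load(avenrun1, EXP_1, active);

            printf("1-min load ~ %lu.%02lu\n",
                   avenrun1 >> FSHIFT,
                   ((avenrun1 & (FIXED_1 - 1)) * 100) >> FSHIFT);
            return 0;
    }

This n-window form is what the NO_HZ catch-up in calc_global_nohz() relies on: after sleeping through n = 1 + (delta / LOAD_FREQ) missed windows it decays the stale averages n times against the currently folded task count instead of replaying every missed tick, which is why the patch's bookkeeping of calc_load_update is only ever advanced in whole LOAD_FREQ steps.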
From a8eefde024c951220e19e143fa46a64707c87d6a Mon Sep 17 00:00:00 2001 From: UchihaDev Date: Wed, 2 May 2018 19:01:39 +0545 Subject: [PATCH 137/146] sched/rt: rebase on aosp --- kernel/sched/rt.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 41926fd70..ff2623b69 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1368,10 +1368,6 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) { struct sched_rt_entity *rt_se = &p->rt; -#ifdef CONFIG_SMP - schedtune_enqueue_task(p, cpu_of(rq)); -#endif - if (flags & ENQUEUE_WAKEUP) rt_se->timeout = 0; @@ -1413,10 +1409,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) { struct sched_rt_entity *rt_se = &p->rt; -#ifdef CONFIG_SMP - schedtune_dequeue_task(p, cpu_of(rq)); -#endif - update_curr_rt(rq); dequeue_rt_entity(rt_se); walt_dec_cumulative_runnable_avg(rq, p); @@ -1868,7 +1860,6 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) !cpumask_test_cpu(lowest_rq->cpu, tsk_cpus_allowed(task)) || task_running(rq, task) || - !rt_task(task) || !task_on_rq_queued(task))) { double_unlock_balance(rq, lowest_rq); From 70bf6c387df2d8e25fa71eb5a854e51b4e4ecc91 Mon Sep 17 00:00:00 2001 From: UchihaDev Date: Wed, 2 May 2018 19:14:33 +0545 Subject: [PATCH 138/146] sched/sched.h: rebase on aosp --- kernel/sched/sched.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index faf51786c..4c34fb034 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -748,8 +748,10 @@ struct rq { unsigned int ttwu_count; unsigned int ttwu_local; +#ifdef CONFIG_SMP struct eas_stats eas_stats; #endif +#endif #ifdef CONFIG_SMP struct llist_head wake_list; @@ -1047,7 +1049,11 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) * per-task data have been completed by this moment. 
*/ smp_wmb(); +#ifdef CONFIG_THREAD_INFO_IN_TASK + p->cpu = cpu; +#else task_thread_info(p)->cpu = cpu; +#endif p->wake_cpu = cpu; #endif } @@ -1251,7 +1257,6 @@ static const u32 prio_to_wmult[40] = { #define DEQUEUE_SLEEP 0x01 #define DEQUEUE_SAVE 0x02 -#define DEQUEUE_IDLE 0x80 /* The last dequeue before IDLE */ #define RETRY_TASK ((void *)-1UL) From 01df1216755e35ae1edea30bc65eeab4f19d4245 Mon Sep 17 00:00:00 2001 From: UchihaDev Date: Wed, 2 May 2018 19:16:58 +0545 Subject: [PATCH 139/146] sched/stats: rebase on aosp --- kernel/sched/stats.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 046a3deb2..6d74a7c77 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -12,6 +12,7 @@ */ #define SCHEDSTAT_VERSION 15 +#ifdef CONFIG_SMP static inline void show_easstat(struct seq_file *seq, struct eas_stats *stats) { /* eas-specific runqueue stats */ @@ -24,15 +25,14 @@ static inline void show_easstat(struct seq_file *seq, struct eas_stats *stats) stats->secb_insuff_cap, stats->secb_no_nrg_sav, stats->secb_nrg_sav, stats->secb_count); - seq_printf(seq, "%llu %llu %llu %llu %llu %llu %llu %llu ", + seq_printf(seq, "%llu %llu %llu %llu %llu ", stats->fbt_attempts, stats->fbt_no_cpu, stats->fbt_no_sd, - stats->fbt_pref_idle, stats->fbt_pref_idle_lum, - stats->fbt_best_active, stats->fbt_best_idle, - stats->fbt_count); + stats->fbt_pref_idle, stats->fbt_count); seq_printf(seq, "%llu %llu\n", stats->cas_attempts, stats->cas_count); } +#endif static int show_schedstat(struct seq_file *seq, void *v) { @@ -61,8 +61,9 @@ static int show_schedstat(struct seq_file *seq, void *v) seq_printf(seq, "\n"); - show_easstat(seq, &rq->eas_stats); #ifdef CONFIG_SMP + show_easstat(seq, &rq->eas_stats); + /* domain-specific stats */ rcu_read_lock(); for_each_domain(cpu, sd) { From 688c16572725f77d344d75ed286f1c9889c444c2 Mon Sep 17 00:00:00 2001 From: UchihaDev Date: Wed, 2 May 2018 19:17:19 +0545 Subject: [PATCH 140/146] delete kernel/sched/swait.c --- kernel/sched/swait.c | 117 ------------------------------------------- 1 file changed, 117 deletions(-) delete mode 100644 kernel/sched/swait.c diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c deleted file mode 100644 index 9c2da06a8..000000000 --- a/kernel/sched/swait.c +++ /dev/null @@ -1,117 +0,0 @@ -#include -#include - -void __init_swait_queue_head(struct swait_queue_head *q, const char *name, - struct lock_class_key *key) -{ - raw_spin_lock_init(&q->lock); - lockdep_set_class_and_name(&q->lock, key, name); - INIT_LIST_HEAD(&q->task_list); -} -EXPORT_SYMBOL(__init_swait_queue_head); - -/* - * The thing about the wake_up_state() return value; I think we can ignore it. - * - * If for some reason it would return 0, that means the previously waiting - * task is already running, so it will observe condition true (or has already). - */ -void swake_up_locked(struct swait_queue_head *q) -{ - struct swait_queue *curr; - - if (list_empty(&q->task_list)) - return; - - curr = list_first_entry(&q->task_list, typeof(*curr), task_list); - wake_up_process(curr->task); - list_del_init(&curr->task_list); -} -EXPORT_SYMBOL(swake_up_locked); - -void swake_up(struct swait_queue_head *q) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&q->lock, flags); - swake_up_locked(q); - raw_spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL(swake_up); - -/* - * Does not allow usage from IRQ disabled, since we must be able to - * release IRQs to guarantee bounded hold time. 
- */ -void swake_up_all(struct swait_queue_head *q) -{ - struct swait_queue *curr; - LIST_HEAD(tmp); - - raw_spin_lock_irq(&q->lock); - list_splice_init(&q->task_list, &tmp); - while (!list_empty(&tmp)) { - curr = list_first_entry(&tmp, typeof(*curr), task_list); - - wake_up_state(curr->task, TASK_NORMAL); - list_del_init(&curr->task_list); - - if (list_empty(&tmp)) - break; - - raw_spin_unlock_irq(&q->lock); - raw_spin_lock_irq(&q->lock); - } - raw_spin_unlock_irq(&q->lock); -} -EXPORT_SYMBOL(swake_up_all); - -void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait) -{ - wait->task = current; - if (list_empty(&wait->task_list)) - list_add(&wait->task_list, &q->task_list); -} - -void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&q->lock, flags); - __prepare_to_swait(q, wait); - set_current_state(state); - raw_spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL(prepare_to_swait); - -long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state) -{ - if (signal_pending_state(state, current)) - return -ERESTARTSYS; - - prepare_to_swait(q, wait, state); - - return 0; -} -EXPORT_SYMBOL(prepare_to_swait_event); - -void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait) -{ - __set_current_state(TASK_RUNNING); - if (!list_empty(&wait->task_list)) - list_del_init(&wait->task_list); -} - -void finish_swait(struct swait_queue_head *q, struct swait_queue *wait) -{ - unsigned long flags; - - __set_current_state(TASK_RUNNING); - - if (!list_empty_careful(&wait->task_list)) { - raw_spin_lock_irqsave(&q->lock, flags); - list_del_init(&wait->task_list); - raw_spin_unlock_irqrestore(&q->lock, flags); - } -} -EXPORT_SYMBOL(finish_swait); From 8d0fd18e52c8c9e72dc36a79844a333a96871fe3 Mon Sep 17 00:00:00 2001 From: UchihaDev Date: Wed, 2 May 2018 19:28:46 +0545 Subject: [PATCH 141/146] sched/tune: rebase on aosp --- kernel/sched/tune.c | 193 ++++++++++---------------------------------- 1 file changed, 42 insertions(+), 151 deletions(-) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 4b270bd7e..b59fbd10d 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -117,64 +117,6 @@ __schedtune_accept_deltas(int nrg_delta, int cap_delta, /* * EAS scheduler tunables for task groups. - * - * When CGroup support is enabled, we have to synchronize two different - * paths: - * - slow path: where CGroups are created/updated/removed - * - fast path: where tasks in a CGroups are accounted - * - * The slow path tracks (a limited number of) CGroups and maps each on a - * "boost_group" index. The fastpath accounts tasks currently RUNNABLE on each - * "boost_group". - * - * Once a new CGroup is created, a boost group idx is assigned and the - * corresponding "boost_group" marked as valid on each CPU. - * Once a CGroup is release, the corresponding "boost_group" is marked as - * invalid on each CPU. The CPU boost value (boost_max) is aggregated by - * considering only valid boost_groups with a non null tasks counter. - * - * .:: Locking strategy - * - * The fast path uses a spin lock for each CPU boost_group which protects the - * tasks counter. - * - * The "valid" and "boost" values of each CPU boost_group is instead - * protected by the RCU lock provided by the CGroups callbacks. Thus, only the - * slow path can access and modify the boost_group attribtues of each CPU. 
- * The fast path will catch up the most updated values at the next scheduling - * event (i.e. enqueue/dequeue). - * - * | - * SLOW PATH | FAST PATH - * CGroup add/update/remove | Scheduler enqueue/dequeue events - * | - * | - * | DEFINE_PER_CPU(struct boost_groups) - * | +--------------+----+---+----+----+ - * | | idle | | | | | - * | | boost_max | | | | | - * | +---->lock | | | | | - * struct schedtune allocated_groups | | | group[ ] | | | | | - * +------------------------------+ +-------+ | | +--+---------+-+----+---+----+----+ - * | idx | | | | | | valid | - * | boots / prefer_idle | | | | | | boost | - * | perf_{boost/constraints}_idx | <---------+(*) | | | | tasks | <------------+ - * | css | +-------+ | | +---------+ | - * +-+----------------------------+ | | | | | | | - * ^ | | | | | | | - * | +-------+ | | +---------+ | - * | | | | | | | | - * | | | | | | | | - * | +-------+ | | +---------+ | - * | zmalloc | | | | | | | - * | | | | | | | | - * | +-------+ | | +---------+ | - * + BOOSTGROUPS_COUNT | | BOOSTGROUPS_COUNT | - * schedtune_boostgroup_init() | + | - * | schedtune_{en,de}queue_task() | - * | + - * | schedtune_tasks_update() - * | */ /* SchdTune tunables for a group of tasks */ @@ -251,14 +193,13 @@ schedtune_accept_deltas(int nrg_delta, int cap_delta, int perf_constrain_idx; /* Optimal (O) region */ - if (nrg_delta <= 0 && cap_delta >= 0) { + if (nrg_delta < 0 && cap_delta > 0) { trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, 1, 0); return INT_MAX; } /* Suboptimal (S) region */ - if ((nrg_delta >= 0 && cap_delta < 0) || - (nrg_delta > 0 && cap_delta <= 0)) { + if (nrg_delta > 0 && cap_delta < 0) { trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, -1, 5); return -INT_MAX; } @@ -285,7 +226,7 @@ schedtune_accept_deltas(int nrg_delta, int cap_delta, * implementation especially for the computation of the per-CPU boost * value */ -#define BOOSTGROUPS_COUNT 8 +#define BOOSTGROUPS_COUNT 5 /* Array of configured boostgroups */ static struct schedtune *allocated_group[BOOSTGROUPS_COUNT] = { @@ -303,17 +244,13 @@ static struct schedtune *allocated_group[BOOSTGROUPS_COUNT] = { */ struct boost_groups { /* Maximum boost value for all RUNNABLE tasks on a CPU */ + bool idle; int boost_max; - u64 boost_ts; struct { - /* True when this boost group maps an actual cgroup */ - bool valid; /* The boost for tasks on that boost group */ int boost; /* Count of RUNNABLE tasks on that boost group */ unsigned tasks; - /* Timestamp of boost activation */ - u64 ts; } group[BOOSTGROUPS_COUNT]; /* CPU's boost group locking */ raw_spinlock_t lock; @@ -322,57 +259,37 @@ struct boost_groups { /* Boost groups affecting each CPU in the system */ DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups); -static inline bool schedtune_boost_timeout(u64 now, u64 ts) -{ - return ((now - ts) > SCHEDTUNE_BOOST_HOLD_NS); -} - -static inline bool -schedtune_boost_group_active(int idx, struct boost_groups* bg, u64 now) -{ - if (bg->group[idx].tasks) - return true; - - return !schedtune_boost_timeout(now, bg->group[idx].ts); -} - static void -schedtune_cpu_update(int cpu, u64 now) +schedtune_cpu_update(int cpu) { struct boost_groups *bg; - u64 boost_ts = now; - int boost_max = INT_MIN; + int boost_max; int idx; bg = &per_cpu(cpu_boost_groups, cpu); for (idx = 0; idx < BOOSTGROUPS_COUNT; ++idx) { - /* Ignore non boostgroups not mapping a cgroup */ - if (!bg->group[idx].valid) - continue; + /* The root boost group is always active */ + boost_max = bg->group[0].boost; + for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) { 
/* * A boost group affects a CPU only if it has * RUNNABLE tasks on that CPU or it has hold * in effect from a previous task. */ - if (!schedtune_boost_group_active(idx, bg, now)) - continue; - - /* this boost group is active */ - if (boost_max > bg->group[idx].boost) + if (bg->group[idx].tasks == 0) continue; - boost_max = bg->group[idx].boost; - boost_ts = bg->group[idx].ts; + boost_max = max(boost_max, bg->group[idx].boost); } - /* If there are no active boost groups on the CPU, set no boost */ - if (boost_max == INT_MIN) - boost_max = 0; + /* Ensures boost_max is non-negative when all cgroup boost values + * are neagtive. Avoids under-accounting of cpu capacity which may cause + * task stacking and frequency spikes.*/ + boost_max = max(boost_max, 0); bg->boost_max = boost_max; - bg->boost_ts = boost_ts; } static int @@ -382,15 +299,11 @@ schedtune_boostgroup_update(int idx, int boost) int cur_boost_max; int old_boost; int cpu; - u64 now; /* Update per CPU boost groups */ for_each_possible_cpu(cpu) { bg = &per_cpu(cpu_boost_groups, cpu); - /* CGroups are never associated to non active cgroups */ - BUG_ON(!bg->group[idx].valid); - /* * Keep track of current boost values to compute the per CPU * maximum only when it has been affected by the new value of @@ -567,19 +480,18 @@ int schedtune_can_attach(struct cgroup_taskset *tset) * current boost group. */ - now = sched_clock_cpu(cpu); - /* Move task from src to dst boost group */ tasks = bg->group[src_bg].tasks - 1; bg->group[src_bg].tasks = max(0, tasks); bg->group[dst_bg].tasks += 1; - bg->group[dst_bg].ts = now; - - /* update next time someone asks */ - bg->boost_ts = now - SCHEDTUNE_BOOST_HOLD_NS; raw_spin_unlock(&bg->lock); unlock_rq_of(rq, task, &irq_flags); + + /* Update CPU boost group */ + if (bg->group[src_bg].tasks == 0 || bg->group[dst_bg].tasks == 1) + schedtune_cpu_update(task_cpu(task)); + } return 0; @@ -660,14 +572,8 @@ void schedtune_exit_task(struct task_struct *tsk) int schedtune_cpu_boost(int cpu) { struct boost_groups *bg; - u64 now; bg = &per_cpu(cpu_boost_groups, cpu); - now = sched_clock_cpu(cpu); - - /* check to see if we have a hold in effect */ - if (schedtune_boost_timeout(now, bg->boost_ts)) - schedtune_cpu_update(cpu, now); return bg->boost_max; } @@ -742,7 +648,7 @@ boost_write(struct cgroup_subsys_state *css, struct cftype *cft, if (boost < -100 || boost > 100) return -EINVAL; - boost_pct = (boost > 0) ? 
boost : -boost; + boost_pct = boost; /* * Update threshold params for Performance Boost (B) @@ -767,15 +673,7 @@ boost_write(struct cgroup_subsys_state *css, struct cftype *cft, /* Update CPU boost */ schedtune_boostgroup_update(st->idx, st->boost); - /* trace stune_name and value */ - trace_sched_tune_boost(css->cgroup->kn->name, boost); - - trace_sched_tune_config(st->boost, - threshold_gains[st->perf_boost_idx].nrg_gain, - threshold_gains[st->perf_boost_idx].cap_gain, - threshold_gains[st->perf_constrain_idx].nrg_gain, - threshold_gains[st->perf_constrain_idx].cap_gain); - + trace_sched_tune_config(st->boost); return 0; } @@ -793,23 +691,23 @@ static struct cftype files[] = { { } /* terminate */ }; -static void -schedtune_boostgroup_init(struct schedtune *st, int idx) +static int +schedtune_boostgroup_init(struct schedtune *st) { struct boost_groups *bg; int cpu; - /* Initialize per CPUs boost group support */ + /* Keep track of allocated boost groups */ + allocated_group[st->idx] = st; + + /* Initialize the per CPU boost groups */ for_each_possible_cpu(cpu) { bg = &per_cpu(cpu_boost_groups, cpu); - bg->group[idx].boost = 0; - bg->group[idx].valid = true; - bg->group[idx].ts = 0; + bg->group[st->idx].boost = 0; + bg->group[st->idx].tasks = 0; } - /* Keep track of allocated boost groups */ - allocated_group[idx] = st; - st->idx = idx; + return 0; } static struct cgroup_subsys_state * @@ -842,10 +740,14 @@ schedtune_css_alloc(struct cgroup_subsys_state *parent_css) goto out; /* Initialize per CPUs boost group support */ - schedtune_boostgroup_init(st, idx); + st->idx = idx; + if (schedtune_boostgroup_init(st)) + goto release; return &st->css; +release: + kfree(st); out: return ERR_PTR(-ENOMEM); } @@ -853,15 +755,8 @@ schedtune_css_alloc(struct cgroup_subsys_state *parent_css) static void schedtune_boostgroup_release(struct schedtune *st) { - struct boost_groups *bg; - int cpu; - - /* Reset per CPUs boost group support */ - for_each_possible_cpu(cpu) { - bg = &per_cpu(cpu_boost_groups, cpu); - bg->group[st->idx].valid = false; - bg->group[st->idx].boost = 0; - } + /* Reset this boost group */ + schedtune_boostgroup_update(st->idx, 0); /* Keep track of allocated boost groups */ allocated_group[st->idx] = NULL; @@ -872,7 +767,6 @@ schedtune_css_free(struct cgroup_subsys_state *css) { struct schedtune *st = css_st(css); - /* Release per CPUs boost group support */ schedtune_boostgroup_release(st); kfree(st); } @@ -880,7 +774,6 @@ schedtune_css_free(struct cgroup_subsys_state *css) struct cgroup_subsys schedtune_cgrp_subsys = { .css_alloc = schedtune_css_alloc, .css_free = schedtune_css_free, - .allow_attach = schedtune_allow_attach, .can_attach = schedtune_can_attach, .cancel_attach = schedtune_cancel_attach, .legacy_cftypes = files, @@ -897,7 +790,6 @@ schedtune_init_cgroups(void) for_each_possible_cpu(cpu) { bg = &per_cpu(cpu_boost_groups, cpu); memset(bg, 0, sizeof(struct boost_groups)); - bg->group[0].valid = true; raw_spin_lock_init(&bg->lock); } @@ -1116,11 +1008,10 @@ schedtune_add_cluster_nrg( * Assume we have EM data only at the CPU and * the upper CLUSTER level */ - if (sd2->parent) - BUG_ON(!cpumask_equal( - sched_group_cpus(sg), - sched_group_cpus(sd2->parent->groups) - )); + BUG_ON(!cpumask_equal( + sched_group_cpus(sg), + sched_group_cpus(sd2->parent->groups) + )); break; } } From 8f0f8d3fcd99e960cbdfe146d4950c9d174511db Mon Sep 17 00:00:00 2001 From: UchihaDev Date: Wed, 2 May 2018 19:29:53 +0545 Subject: [PATCH 142/146] sched/walt: rebase on aosp --- kernel/sched/walt.c | 7 
++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c index bb3922ff3..8d25ffbe4 100644 --- a/kernel/sched/walt.c +++ b/kernel/sched/walt.c @@ -55,7 +55,7 @@ __read_mostly unsigned int walt_ravg_window = static unsigned int sync_cpu; static ktime_t ktime_last; -static __read_mostly bool walt_ktime_suspended; +static bool walt_ktime_suspended; static unsigned int task_load(struct task_struct *p) { @@ -104,8 +104,10 @@ walt_dec_cumulative_runnable_avg(struct rq *rq, static void fixup_cumulative_runnable_avg(struct rq *rq, - struct task_struct *p, s64 task_load_delta) + struct task_struct *p, u64 new_task_load) { + s64 task_load_delta = (s64)new_task_load - task_load(p); + rq->cumulative_runnable_avg += task_load_delta; if ((s64)rq->cumulative_runnable_avg < 0) panic("cra less than zero: tld: %lld, task_load(p) = %u\n", @@ -215,7 +217,6 @@ update_window_start(struct rq *rq, u64 wallclock) nr_windows = div64_u64(delta, walt_ravg_window); rq->window_start += (u64)nr_windows * (u64)walt_ravg_window; - cpufreq_update_util(rq, 0); rq->cum_window_demand = rq->cumulative_runnable_avg; } From 306643c3dfe1dc8d52d47befde23ee9b9060c724 Mon Sep 17 00:00:00 2001 From: UchihaDev Date: Wed, 2 May 2018 20:16:38 +0545 Subject: [PATCH 143/146] sched/Makefile: rebase on aosp --- kernel/sched/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 5bc46e18d..7a060dbdb 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -19,7 +19,7 @@ endif obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o -obj-y += wait.o swait.o completion.o idle.o +obj-y += wait.o completion.o idle.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o energy.o obj-$(CONFIG_SCHED_WALT) += walt.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o From b5c69cd564375d2b4e1b598a1693713ea2393ef6 Mon Sep 17 00:00:00 2001 From: UchihaDev Date: Wed, 2 May 2018 21:29:20 +0545 Subject: [PATCH 144/146] sched/tune: rebase on aosp --- kernel/sched/tune.c | 28 ++++++++-------------------- 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index b59fbd10d..31de5d56f 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -315,14 +315,9 @@ schedtune_boostgroup_update(int idx, int boost) /* Update the boost value of this boost group */ bg->group[idx].boost = boost; - now = sched_clock_cpu(cpu); - /* - * Check if this update increase current max. 
- */ - if (boost > cur_boost_max && - schedtune_boost_group_active(idx, bg, now)) { + /* Check if this update increase current max */ + if (boost > cur_boost_max && bg->group[idx].tasks) { bg->boost_max = boost; - bg->boost_ts = bg->group[idx].ts; trace_sched_tune_boostgroup_update(cpu, 1, bg->boost_max); continue; @@ -330,7 +325,7 @@ schedtune_boostgroup_update(int idx, int boost) /* Check if this update has decreased current max */ if (cur_boost_max == old_boost && old_boost > boost) { - schedtune_cpu_update(cpu, now); + schedtune_cpu_update(cpu); trace_sched_tune_boostgroup_update(cpu, -1, bg->boost_max); continue; } @@ -362,20 +357,13 @@ schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count) /* Update boosted tasks count while avoiding to make it negative */ bg->group[idx].tasks = max(0, tasks); - /* Update timeout on enqueue */ - if (task_count > 0) { - now = sched_clock_cpu(cpu); - if (schedtune_update_timestamp(p)) - bg->group[idx].ts = now; - - /* Boost group activation or deactivation on that RQ */ - if (bg->group[idx].tasks == 1) - schedtune_cpu_update(cpu, now); - } trace_sched_tune_tasks_update(p, cpu, tasks, idx, - bg->group[idx].boost, bg->boost_max, - bg->group[idx].ts); + bg->group[idx].boost, bg->boost_max); + + /* Boost group activation or deactivation on that RQ */ + if (tasks == 1 || tasks == 0) + schedtune_cpu_update(cpu); } /* From b9b98e6e02d537a9a51a34d8f55534b2a32cce24 Mon Sep 17 00:00:00 2001 From: UchihaDev Date: Wed, 2 May 2018 21:39:40 +0545 Subject: [PATCH 145/146] Revert "cpu-boost: Implement Dynamic SchedTune Boost v3" This reverts commit 161c2150ee126298cdd4bb47cf75f7d8f61517c7. --- drivers/cpufreq/cpu-boost.c | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/drivers/cpufreq/cpu-boost.c b/drivers/cpufreq/cpu-boost.c index e980faae2..d59d44c31 100644 --- a/drivers/cpufreq/cpu-boost.c +++ b/drivers/cpufreq/cpu-boost.c @@ -40,11 +40,6 @@ module_param(input_boost_enabled, uint, 0644); static unsigned int input_boost_ms = 40; module_param(input_boost_ms, uint, 0644); -#ifdef CONFIG_DYNAMIC_STUNE_BOOST -static int dynamic_stune_boost; -module_param(dynamic_stune_boost, uint, 0644); -#endif /* CONFIG_DYNAMIC_STUNE_BOOST */ - static struct delayed_work input_boost_rem; static u64 last_input_time; #define MIN_INPUT_INTERVAL (150 * USEC_PER_MSEC) @@ -174,11 +169,6 @@ static void do_input_boost_rem(struct work_struct *work) i_sync_info->input_boost_min = 0; } -#ifdef CONFIG_DYNAMIC_STUNE_BOOST - /* Reset dynamic stune boost value to the default value */ - reset_stune_boost("top-app"); -#endif /* CONFIG_DYNAMIC_STUNE_BOOST */ - /* Update policies for all online CPUs */ update_policy_online(); @@ -201,11 +191,6 @@ static void do_input_boost(struct work_struct *work) /* Update policies for all online CPUs */ update_policy_online(); -#ifdef CONFIG_DYNAMIC_STUNE_BOOST - /* Set dynamic stune boost value */ - do_stune_boost("top-app", dynamic_stune_boost); -#endif /* CONFIG_DYNAMIC_STUNE_BOOST */ - queue_delayed_work(cpu_boost_wq, &input_boost_rem, msecs_to_jiffies(input_boost_ms)); } @@ -261,11 +246,6 @@ static int cpuboost_input_connect(struct input_handler *handler, static void cpuboost_input_disconnect(struct input_handle *handle) { -#ifdef CONFIG_DYNAMIC_STUNE_BOOST - /* Reset dynamic stune boost value to the default value */ - reset_stune_boost("top-app"); -#endif /* CONFIG_DYNAMIC_STUNE_BOOST */ - input_close_device(handle); input_unregister_handle(handle); kfree(handle); From cf2ad3379ff2644157bb00de09750520288ed71b Mon 
Sep 17 00:00:00 2001 From: UchihaDev Date: Wed, 2 May 2018 22:06:59 +0545 Subject: [PATCH 146/146] Kernel/sched rebase complete --- arch/arm64/configs/hyperplus_defconfig | 2 +- arch/arm64/kernel/topology.c | 91 +------------- include/linux/sched.h | 5 +- include/linux/sched/sysctl.h | 4 - include/trace/events/sched.h | 157 ++++++++++++------------- kernel/hwcfs/hwcfs_common.c | 4 - kernel/sched/tune.c | 114 +----------------- kernel/signal.c | 23 ---- kernel/sysctl.c | 9 -- 9 files changed, 80 insertions(+), 329 deletions(-) diff --git a/arch/arm64/configs/hyperplus_defconfig b/arch/arm64/configs/hyperplus_defconfig index 8f687ebfd..5315b5d97 100644 --- a/arch/arm64/configs/hyperplus_defconfig +++ b/arch/arm64/configs/hyperplus_defconfig @@ -164,7 +164,7 @@ CONFIG_BLK_CGROUP=y # CONFIG_DEBUG_BLK_CGROUP is not set # CONFIG_CHECKPOINT_RESTORE is not set # CONFIG_NAMESPACES is not set -CONFIG_HISI_EAS_SCHED=y +# CONFIG_HISI_EAS_SCHED CONFIG_HISI_RT_OPT=y # CONFIG_SCHED_AUTOGROUP is not set CONFIG_SCHED_TUNE=y diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index e55824012..7cdb06c4b 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -221,92 +221,6 @@ static int __init parse_dt_topology(void) struct cpu_topology cpu_topology[NR_CPUS]; EXPORT_SYMBOL_GPL(cpu_topology); -#ifdef CONFIG_HISI_EAS_SCHED -static const char * const little_cores[] = { - "arm,cortex-a53", - NULL, -}; - -static bool is_little_cpu(struct device_node *cn) -{ - const char * const *lc; - for (lc = little_cores; *lc; lc++) - if (of_device_is_compatible(cn, *lc)) - return true; - return false; -} - -void __init arch_get_fast_and_slow_cpus(struct cpumask *fast, - struct cpumask *slow) -{ - struct device_node *cn = NULL; - int cpu; - - cpumask_clear(fast); - cpumask_clear(slow); - - /* - * Else, parse device tree for little cores. - */ - while ((cn = of_find_node_by_type(cn, "cpu"))) { - const u32 *mpidr; - int len; - - mpidr = of_get_property(cn, "reg", &len); - if (!mpidr || len != 8) { - pr_err("%s missing reg property\n", cn->full_name); - continue; - } - - cpu = get_logical_index(be32_to_cpup(mpidr+1)); - if (cpu == -EINVAL) { - pr_err("couldn't get logical index for mpidr %x\n", - be32_to_cpup(mpidr+1)); - break; - } - - if (is_little_cpu(cn)) - cpumask_set_cpu(cpu, slow); - else - cpumask_set_cpu(cpu, fast); - } - - if (!cpumask_empty(fast) && !cpumask_empty(slow)) - return; - - /* - * We didn't find both big and little cores so let's call all cores - * fast as this will keep the system running, with all cores being - * treated equal. 
- */ - cpumask_setall(fast); - cpumask_clear(slow); -} - -struct cpumask slow_cpu_mask; -struct cpumask fast_cpu_mask; -void hisi_get_fast_cpus(struct cpumask *cpumask) -{ - cpumask_copy(cpumask, &fast_cpu_mask); -} -EXPORT_SYMBOL(hisi_get_fast_cpus); - -void hisi_get_slow_cpus(struct cpumask *cpumask) -{ - cpumask_copy(cpumask, &slow_cpu_mask); -} -EXPORT_SYMBOL(hisi_get_slow_cpus); - -int hisi_test_fast_cpu(int cpu) -{ - if (cpumask_test_cpu(cpu, &fast_cpu_mask)) - return 1; - else - return 0; -} -EXPORT_SYMBOL(hisi_test_fast_cpu); -#endif - /* sd energy functions */ static inline const struct sched_group_energy * const cpu_cluster_energy(int cpu) @@ -369,7 +283,7 @@ static void update_cpu_capacity(unsigned int cpu) set_capacity_scale(cpu, capacity); - pr_debug("CPU%d: update cpu_capacity %lu\n", + pr_info("CPU%d: update cpu_capacity %lu\n", cpu, arch_scale_cpu_capacity(NULL, cpu)); } @@ -470,7 +384,4 @@ void __init init_cpu_topology(void) init_sched_energy_costs(); -#ifdef CONFIG_HISI_EAS_SCHED - arch_get_fast_and_slow_cpus(&fast_cpu_mask, &slow_cpu_mask); -#endif } diff --git a/include/linux/sched.h b/include/linux/sched.h index 2f4953c80..69f366621 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -233,9 +233,10 @@ extern void proc_sched_set_task(struct task_struct *p); #define TASK_WAKING 256 #define TASK_PARKED 512 #define TASK_NOLOAD 1024 -#define TASK_STATE_MAX 2048 +#define TASK_NEW 2048 +#define TASK_STATE_MAX 4096 -#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPN" +#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPNn" extern char ___assert_task_state[1 - 2*!!( sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)]; diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 0a2535647..5e566733c 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -35,10 +35,6 @@ enum { sysctl_hung_task_timeout_secs = 0 }; extern int sysctl_max_map_count; -#ifdef CONFIG_BOOST_KILL -extern unsigned int sysctl_boost_killing; -#endif - #ifdef CONFIG_HW_VIP_THREAD #include #endif diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 920ae6f96..99ce03357 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -769,9 +769,9 @@ TRACE_EVENT(sched_load_avg_cpu, __entry->util_avg_pelt = cfs_rq->avg.util_avg; __entry->util_avg_walt = 0; #ifdef CONFIG_SCHED_WALT - __entry->util_avg_walt = - cpu_rq(cpu)->prev_runnable_sum << SCHED_LOAD_SHIFT; - do_div(__entry->util_avg_walt, walt_ravg_window); + __entry->util_avg_walt = + div64_u64(cpu_rq(cpu)->cumulative_runnable_avg, + walt_ravg_window >> SCHED_LOAD_SHIFT); if (!walt_disabled && sysctl_sched_use_walt_cpu_util) __entry->util_avg = __entry->util_avg_walt; #endif @@ -783,81 +783,24 @@ TRACE_EVENT(sched_load_avg_cpu, __entry->util_avg_pelt, __entry->util_avg_walt) ); -/* - * Tracepoint for eas attribute store - */ -TRACE_EVENT(eas_attr_store, - - TP_PROTO(const char *name, int value), - - TP_ARGS(name, value), - - TP_STRUCT__entry( - __array( char, name, TASK_COMM_LEN ) - __field( int, value ) - ), - - TP_fast_assign( - memcpy(__entry->name, name, TASK_COMM_LEN); - __entry->value = value; - ), - - TP_printk("name=%s value=%d", __entry->name, __entry->value) -); - -/* - * Tracepoint for schedtune_boost - */ -TRACE_EVENT(sched_tune_boost, - - TP_PROTO(const char *name, int boost), - - TP_ARGS(name, boost), - - TP_STRUCT__entry( - __array( char, name, TASK_COMM_LEN ) - __field( int, boost ) - ), - - TP_fast_assign( - memcpy(__entry->name, name, 
TASK_COMM_LEN); - __entry->boost = boost; - ), - - TP_printk("name=%s boost=%d", __entry->name, __entry->boost) -); - /* * Tracepoint for sched_tune_config settings */ TRACE_EVENT(sched_tune_config, - TP_PROTO(int boost, int pb_nrg_gain, int pb_cap_gain, int pc_nrg_gain, int pc_cap_gain), + TP_PROTO(int boost), - TP_ARGS(boost, pb_nrg_gain, pb_cap_gain, pc_nrg_gain, pc_cap_gain), + TP_ARGS(boost), TP_STRUCT__entry( __field( int, boost ) - __field( int, pb_nrg_gain ) - __field( int, pb_cap_gain ) - __field( int, pc_nrg_gain ) - __field( int, pc_cap_gain ) ), TP_fast_assign( __entry->boost = boost; - __entry->pb_nrg_gain = pb_nrg_gain; - __entry->pb_cap_gain = pb_cap_gain; - __entry->pc_nrg_gain = pc_nrg_gain; - __entry->pc_cap_gain = pc_cap_gain; - ), - - TP_printk("boost=%d " - "pb_nrg_gain=%d pb_cap_gain=%d " - "pc_nrg_gain=%d pc_cap_gain=%d", - __entry->boost, - __entry->pb_nrg_gain, __entry->pb_cap_gain, - __entry->pc_nrg_gain, __entry->pc_cap_gain) + ), + + TP_printk("boost=%d ", __entry->boost) ); /* @@ -893,9 +836,9 @@ TRACE_EVENT(sched_boost_cpu, TRACE_EVENT(sched_tune_tasks_update, TP_PROTO(struct task_struct *tsk, int cpu, int tasks, int idx, - int boost, int max_boost, u64 group_ts), + int boost, int max_boost), - TP_ARGS(tsk, cpu, tasks, idx, boost, max_boost, group_ts), + TP_ARGS(tsk, cpu, tasks, idx, boost, max_boost), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) @@ -905,7 +848,6 @@ TRACE_EVENT(sched_tune_tasks_update, __field( int, idx ) __field( int, boost ) __field( int, max_boost ) - __field( u64, group_ts ) ), TP_fast_assign( @@ -916,15 +858,13 @@ TRACE_EVENT(sched_tune_tasks_update, __entry->idx = idx; __entry->boost = boost; __entry->max_boost = max_boost; - __entry->group_ts = group_ts; ), TP_printk("pid=%d comm=%s " - "cpu=%d tasks=%d idx=%d boost=%d max_boost=%d timeout=%llu", + "cpu=%d tasks=%d idx=%d boost=%d max_boost=%d", __entry->pid, __entry->comm, __entry->cpu, __entry->tasks, __entry->idx, - __entry->boost, __entry->max_boost, - __entry->group_ts) + __entry->boost, __entry->max_boost) ); /* @@ -989,11 +929,9 @@ TRACE_EVENT(sched_find_best_target, TP_PROTO(struct task_struct *tsk, bool prefer_idle, unsigned long min_util, int start_cpu, - bool low_util_mode, int low_util_cpu, int best_idle, int best_active, int target), TP_ARGS(tsk, prefer_idle, min_util, start_cpu, - low_util_mode, low_util_cpu, best_idle, best_active, target), TP_STRUCT__entry( @@ -1002,8 +940,6 @@ TRACE_EVENT(sched_find_best_target, __field( unsigned long, min_util ) __field( bool, prefer_idle ) __field( int, start_cpu ) - __field( bool, low_util_mode ) - __field( int, low_util_cpu ) __field( int, best_idle ) __field( int, best_active ) __field( int, target ) @@ -1015,23 +951,76 @@ TRACE_EVENT(sched_find_best_target, __entry->min_util = min_util; __entry->prefer_idle = prefer_idle; __entry->start_cpu = start_cpu; - __entry->low_util_mode = low_util_mode; - __entry->low_util_cpu = low_util_cpu; __entry->best_idle = best_idle; __entry->best_active = best_active; __entry->target = target; ), TP_printk("pid=%d comm=%s prefer_idle=%d start_cpu=%d " - "low_util_mode=%d, low_util_cpu=%d " "best_idle=%d best_active=%d target=%d", __entry->pid, __entry->comm, __entry->prefer_idle, __entry->start_cpu, - __entry->low_util_mode, __entry->low_util_cpu, __entry->best_idle, __entry->best_active, __entry->target) ); +/* + * Tracepoint for accounting sched group energy + */ +TRACE_EVENT(sched_energy_diff, + + TP_PROTO(struct task_struct *tsk, int scpu, int dcpu, int udelta, + int nrgb, int nrga, 
int nrgd, int capb, int capa, int capd, + int nrgn, int nrgp), + + TP_ARGS(tsk, scpu, dcpu, udelta, + nrgb, nrga, nrgd, capb, capa, capd, + nrgn, nrgp), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, scpu ) + __field( int, dcpu ) + __field( int, udelta ) + __field( int, nrgb ) + __field( int, nrga ) + __field( int, nrgd ) + __field( int, capb ) + __field( int, capa ) + __field( int, capd ) + __field( int, nrgn ) + __field( int, nrgp ) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->scpu = scpu; + __entry->dcpu = dcpu; + __entry->udelta = udelta; + __entry->nrgb = nrgb; + __entry->nrga = nrga; + __entry->nrgd = nrgd; + __entry->capb = capb; + __entry->capa = capa; + __entry->capd = capd; + __entry->nrgn = nrgn; + __entry->nrgp = nrgp; + ), + + TP_printk("pid=%d comm=%s " + "src_cpu=%d dst_cpu=%d usage_delta=%d " + "nrg_before=%d nrg_after=%d nrg_diff=%d " + "cap_before=%d cap_after=%d cap_delta=%d " + "nrg_delta=%d nrg_payoff=%d", + __entry->pid, __entry->comm, + __entry->scpu, __entry->dcpu, __entry->udelta, + __entry->nrgb, __entry->nrga, __entry->nrgd, + __entry->capb, __entry->capa, __entry->capd, + __entry->nrgn, __entry->nrgp) +); + /* * Tracepoint for schedtune_tasks_update */ @@ -1113,6 +1102,7 @@ TRACE_EVENT(walt_update_task_ravg, __field( int, cpu ) __field( u64, cs ) __field( u64, ps ) + __field(unsigned long, util ) __field( u32, curr_window ) __field( u32, prev_window ) __field( u64, nt_cs ) @@ -1136,6 +1126,8 @@ TRACE_EVENT(walt_update_task_ravg, __entry->irqtime = irqtime; __entry->cs = rq->curr_runnable_sum; __entry->ps = rq->prev_runnable_sum; + __entry->util = rq->prev_runnable_sum << SCHED_LOAD_SHIFT; + do_div(__entry->util, walt_ravg_window); __entry->curr_window = p->ravg.curr_window; __entry->prev_window = p->ravg.prev_window; __entry->nt_cs = rq->nt_curr_runnable_sum; @@ -1144,15 +1136,14 @@ TRACE_EVENT(walt_update_task_ravg, ), TP_printk("wc %llu ws %llu delta %llu event %d cpu %d cur_pid %d task %d (%s) ms %llu delta %llu demand %u sum %u irqtime %llu" - " cs %llu ps %llu cur_window %u prev_window %u nt_cs %llu nt_ps %llu active_wins %u" + " cs %llu ps %llu util %lu cur_window %u prev_window %u active_wins %u" , __entry->wallclock, __entry->win_start, __entry->delta, __entry->evt, __entry->cpu, __entry->cur_pid, __entry->pid, __entry->comm, __entry->mark_start, __entry->delta_m, __entry->demand, __entry->sum, __entry->irqtime, - __entry->cs, __entry->ps, + __entry->cs, __entry->ps, __entry->util, __entry->curr_window, __entry->prev_window, - __entry->nt_cs, __entry->nt_ps, __entry->active_windows ) ); diff --git a/kernel/hwcfs/hwcfs_common.c b/kernel/hwcfs/hwcfs_common.c index a22047497..9d615bfee 100644 --- a/kernel/hwcfs/hwcfs_common.c +++ b/kernel/hwcfs/hwcfs_common.c @@ -327,10 +327,6 @@ static int vip_can_migrate(struct task_struct *p, struct rq *src_rq, struct rq * return 1; } -extern void hisi_get_fast_cpus(struct cpumask *cpumask); -extern void hisi_get_slow_cpus(struct cpumask *cpumask); -static struct cpumask hisi_slow_cpu_mask; - static int __do_vip_balance(void *data) { struct rq *src_rq = data; diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 31de5d56f..ba4cd9f23 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -17,17 +17,8 @@ bool schedtune_initialized = false; unsigned int sysctl_sched_cfs_boost __read_mostly; -/* We hold schedtune boost in effect for at least this long */ -#define SCHEDTUNE_BOOST_HOLD_NS 
50000000ULL - extern struct reciprocal_value schedtune_spc_rdiv; -struct target_nrg schedtune_target_nrg; - -#ifdef CONFIG_DYNAMIC_STUNE_BOOST -static DEFINE_MUTEX(stune_boost_mutex); -static struct schedtune *getSchedtune(char *st_name); -static int dynamic_boost_write(struct schedtune *st, int boost); -#endif /* CONFIG_DYNAMIC_STUNE_BOOST */ +extern struct target_nrg schedtune_target_nrg; /* Performance Boost region (B) threshold params */ static int perf_boost_idx; @@ -140,13 +131,6 @@ struct schedtune { * towards idle CPUs */ int prefer_idle; -#ifdef CONFIG_DYNAMIC_STUNE_BOOST - /* - * This tracks the default boost value and is used to restore - * the value when Dynamic SchedTune Boost is reset. - */ - int boost_default; -#endif /* CONFIG_DYNAMIC_STUNE_BOOST */ }; static inline struct schedtune *css_st(struct cgroup_subsys_state *css) @@ -179,9 +163,6 @@ root_schedtune = { .perf_boost_idx = 0, .perf_constrain_idx = 0, .prefer_idle = 0, -#ifdef CONFIG_DYNAMIC_STUNE_BOOST - .boost_default = 0, -#endif /* CONFIG_DYNAMIC_STUNE_BOOST */ }; int @@ -268,8 +249,6 @@ schedtune_cpu_update(int cpu) bg = &per_cpu(cpu_boost_groups, cpu); - for (idx = 0; idx < BOOSTGROUPS_COUNT; ++idx) { - /* The root boost group is always active */ boost_max = bg->group[0].boost; for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) { @@ -339,21 +318,11 @@ schedtune_boostgroup_update(int idx, int boost) #define ENQUEUE_TASK 1 #define DEQUEUE_TASK -1 -static inline bool -schedtune_update_timestamp(struct task_struct *p) -{ - if (sched_feat(SCHEDTUNE_BOOST_HOLD_ALL)) - return true; - - return task_has_rt_policy(p); -} - static inline void schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count) { struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu); int tasks = bg->group[idx].tasks + task_count; - u64 now; /* Update boosted tasks count while avoiding to make it negative */ bg->group[idx].tasks = max(0, tasks); @@ -405,12 +374,6 @@ void schedtune_enqueue_task(struct task_struct *p, int cpu) raw_spin_unlock_irqrestore(&bg->lock, irq_flags); } -int schedtune_allow_attach(struct cgroup_taskset *tset) -{ - /* We always allows tasks to be moved between existing CGroups */ - return 0; -} - int schedtune_can_attach(struct cgroup_taskset *tset) { struct task_struct *task; @@ -422,7 +385,6 @@ int schedtune_can_attach(struct cgroup_taskset *tset) int src_bg; /* Source boost group index */ int dst_bg; /* Destination boost group index */ int tasks; - u64 now; if (!unlikely(schedtune_initialized)) return 0; @@ -649,9 +611,6 @@ boost_write(struct cgroup_subsys_state *css, struct cftype *cft, st->perf_constrain_idx = threshold_idx; st->boost = boost; -#ifdef CONFIG_DYNAMIC_STUNE_BOOST - st->boost_default = boost; -#endif /* CONFIG_DYNAMIC_STUNE_BOOST */ if (css == &root_schedtune.css) { sysctl_sched_cfs_boost = boost; perf_boost_idx = threshold_idx; @@ -787,77 +746,6 @@ schedtune_init_cgroups(void) schedtune_initialized = true; } -#ifdef CONFIG_DYNAMIC_STUNE_BOOST -static struct schedtune *getSchedtune(char *st_name) -{ - int idx; - - for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) { - char name_buf[NAME_MAX + 1]; - struct schedtune *st = allocated_group[idx]; - - if (!st) { - pr_warn("SCHEDTUNE: Could not find %s\n", st_name); - break; - } - - cgroup_name(st->css.cgroup, name_buf, sizeof(name_buf)); - if (strncmp(name_buf, st_name, strlen(st_name)) == 0) - return st; - } - - return NULL; -} - -static int dynamic_boost_write(struct schedtune *st, int boost) -{ - int ret; - /* Backup boost_default */ - int 
boost_default_backup = st->boost_default; - - ret = boost_write(&st->css, NULL, boost); - - /* Restore boost_default */ - st->boost_default = boost_default_backup; - - return ret; -} - -int do_stune_boost(char *st_name, int boost) -{ - int ret = 0; - struct schedtune *st = getSchedtune(st_name); - - if (!st) - return -EINVAL; - - mutex_lock(&stune_boost_mutex); - - /* Boost if new value is greater than current */ - if (boost > st->boost) - ret = dynamic_boost_write(st, boost); - - mutex_unlock(&stune_boost_mutex); - - return ret; -} - -int reset_stune_boost(char *st_name) -{ - int ret = 0; - struct schedtune *st = getSchedtune(st_name); - - if (!st) - return -EINVAL; - - mutex_lock(&stune_boost_mutex); - ret = dynamic_boost_write(st, st->boost_default); - mutex_unlock(&stune_boost_mutex); - - return ret; -} -#endif /* CONFIG_DYNAMIC_STUNE_BOOST */ - #else /* CONFIG_CGROUP_SCHEDTUNE */ int diff --git a/kernel/signal.c b/kernel/signal.c index cf485a9d8..47833c269 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -49,15 +49,6 @@ #include #endif -#ifdef CONFIG_BOOST_KILL -extern void hisi_get_fast_cpus(struct cpumask *cpumask); - -/* Add apportunity to config enable/disable boost - * killing action - */ -unsigned int sysctl_boost_killing; -#endif - /* * SLAB caches for signal bits. */ @@ -894,11 +885,6 @@ static void complete_signal(int sig, struct task_struct *p, int group) { struct signal_struct *signal = p->signal; struct task_struct *t; -/*lint -save -e504*/ -#ifdef CONFIG_BOOST_KILL - cpumask_t new_mask = CPU_MASK_ALL; -#endif -/*lint -restore*/ /* * Now find a thread we can wake up to take the signal off the queue. @@ -955,15 +941,6 @@ static void complete_signal(int sig, struct task_struct *p, int group) signal->group_stop_count = 0; t = p; do { -#ifdef CONFIG_BOOST_KILL - if (sysctl_boost_killing) { - if (can_nice(t, -20)) - set_user_nice(t, -20); - hisi_get_fast_cpus(&new_mask); - cpumask_copy(&t->cpus_allowed, &new_mask); - t->nr_cpus_allowed = cpumask_weight(&new_mask); - } -#endif task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index bd09be3e2..19e978405 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -334,15 +334,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, -#ifdef CONFIG_BOOST_KILL - { - .procname = "boost_killing", - .data = &sysctl_boost_killing, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif #ifdef CONFIG_HW_VIP_THREAD { .procname = "vip_min_sched_delay_granularity",